///////////////////////////////////////////////////////////////////////////////////////
// File   : convol.c  
// Date   : june 2014
// author : Alain Greiner
///////////////////////////////////////////////////////////////////////////////////////
// This multi-threaded application implements a 2D convolution product.  
// It can run on a multi-cores, multi-clusters architecture, with one thread
// per core, and uses the POSIX threads API.
// 
// The input image is read from a file and the output image is saved to another file.
//
// - number of clusters containing processors must be power of 2 no larger than 256.
// - number of processors per cluster must be power of 2 no larger than 4.
// - number of working threads is the number of cores available in the hardware
//   architecture : nthreads = nclusters * ncores.
//
// The convolution kernel is defined in the execute() function.
// It can be factored in two independent line and column convolution products.
//
// The main() function can be launched on any processor.
// - It checks software requirements versus the hardware resources.
// - It opens & maps the input file to a global <image_in> buffer.
// - it opens & maps the output file to another global <image_out> buffer.
// - it opens the instrumentation file.
// - it creates & activates two FBF windows to display input & output images.
// - it launches other threads to run in parallel the execute() function.
// - it saves the instrumentation results on disk.
// - it closes the input, output, & instrumentation files.
// - it deletes the FBF input & output windows.
//
// The execute() function is executed in parallel by all threads. These threads are
// working on 5 arrays of distributed buffers, indexed by the cluster index [cid].
// - A[cid]: contains the distributed initial image (NL/NCLUSTERS lines per cluster).
// - B[cid]: is the result of the horizontal filter, then transpose : B <= Trsp(HF(A))
// - C[cid]: is the result of the vertical filter, then transpose : C <= Trsp(VF(B))
// - D[cid]: is the difference between A and HF(A) : D <= A - HF(A)
// - Z[cid]: contains the distributed final image : Z <= C + D
//
// It can be split in four phases separated by synchronisation barriers:
// 1. Initialisation:
//    Allocates the 5 A[cid],B[cid],C[cid],D[cid],Z[cid] buffers, initialise A[cid]
//    from the <image_in> buffer, and display the initial image on FBF if required.
// 2. Horizontal Filter:
//    Set B[cid] and D[cid] from A[cid]. Read data accesses are local, write data
//    accesses are remote, to implement the transpose. 
// 3. Vertical Filter:  
//    Set C[cid] from B[cid]. Read data accesses are local, write data accesses 
//    are remote, to implement the transpose.
// 4. Save results:
//    Set the Z[cid] from C[cid] and D[cid]. All read and write access are local.
//    Move the final image (Z[cid] buffer) to the <image_out> buffer.   
//
// This application supports three placement modes, implemented in the main() function.
// In all modes, the working threads are identified by the [tid] continuous index 
// in range [0, NTHREADS-1], and defines how the lines are shared amongst the threads.
// This continuous index can always be decomposed in two continuous sub-indexes:
// tid == cid * NCORES + lid,  where cid is in [0,NCLUSTERS-1] and lid in [0,NCORES-1].
//
// - NO_PLACEMENT: the main thread is itself a working thread. The (N-1) other working
//   threads are created by the main thread, but the placement is done by the OS, using
//   the DQDT for load balancing, and two working threads can be placed on the same core.
//   The [cid,lid] are only abstract identifiers, and cannot be associated to a physical
//   cluster or a physical core. In this mode, the main thread runs on any cluster,
//   but has tid = 0 (i.e. cid = 0 & lid = 0).
//
// - EXPLICIT_PLACEMENT: the main thread is again a working thread, but the placement 
//   of the threads on the cores is explicitly controlled by the main thread to have
//   exactly one working thread per core, and the [cxy][lpid] core coordinates for a given
//   thread[tid] can be directly derived from the [tid] value: [cid] is an alias for the
//   physical cluster identifier, and [lid] is the local core index.
//
// - PARALLEL_PLACEMENT: the main thread is not anymore a working thread, and uses the
//   non standard pthread_parallel_create() function to avoid the costly sequential
//   loops for pthread_create() and pthread_join(). It guarantees one working thread 
//   per core, and the same relation between the thread[tid] and the core[cxy][lpid].
//
// The [tid] continuous index defines how the work is shared amongst the threads:
// - each thread handles NL/nthreads lines for the horizontal filter.
// - each thread handles NP/nthreads columns for the vertical filter.
///////////////////////////////////////////////////////////////////////////////////////

#include <sys/mman.h>
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <pthread.h>
#include <string.h>
#include <almosmkh.h>
#include <hal_macros.h>

// Trace switches : a non-zero value enables the printf() traces guarded by the
// matching #if in main() (VERBOSE_MAIN) or in execute() (VERBOSE_EXEC).
// NOTE(review): SUPER_VERBOSE is presumably used further down in the file - confirm.
#define VERBOSE_MAIN               1
#define VERBOSE_EXEC               1
#define SUPER_VERBOSE              0

// Upper bounds on the platform, used to size the statically allocated arrays.
#define X_MAX                      16
#define Y_MAX                      16
#define CORES_MAX                  4
#define CLUSTERS_MAX               (X_MAX * Y_MAX)
#define THREADS_MAX                (X_MAX * Y_MAX * CORES_MAX)

// Default image features, used when INTERACTIVE_MODE is zero.
#define IMAGE_TYPE                 420                         // pixel encoding type
#define INPUT_IMAGE_PATH           "misc/couple_512.raw"       // default image_in
#define OUTPUT_IMAGE_PATH          "misc/couple_conv_512.raw"  // default image_out
#define NL                         512                         // default nlines
#define NP                         512                         // default npixels

// Placement mode : exactly one of these three switches must be set to 1
// (this is checked at the beginning of main()).
#define NO_PLACEMENT               0
#define EXPLICIT_PLACEMENT         0
#define PARALLEL_PLACEMENT         1

// Run-time options, tested in main() and execute().
#define INTERACTIVE_MODE           0
#define USE_DQT_BARRIER            1
#define INITIAL_DISPLAY_ENABLE     1
#define FINAL_DISPLAY_ENABLE       1

// Accessors for the distributed buffers. They expect arrays of pointers named
// A, B, C, D, Z to be visible at the point of use (c is the cluster index).
// - TA, TC, TD, TZ are line-major   : element (l,p) is at offset (image_np * l) + p.
// - TB is pixel-major (transposed)  : element (p,l) is at offset (image_nl * p) + l.
// The strides are the actual image sizes <image_np> / <image_nl> (global variables
// set by main()), and not the NP / NL default values: the buffers are allocated and
// filled from the actual sizes, which can differ from the defaults when
// INTERACTIVE_MODE is set.
#define TA(c,l,p)  (A[c][((image_np) * (l)) + (p)])
#define TB(c,p,l)  (B[c][((image_nl) * (p)) + (l)])
#define TC(c,l,p)  (C[c][((image_np) * (l)) + (p)])
#define TD(c,l,p)  (D[c][((image_np) * (l)) + (p)])
#define TZ(c,l,p)  (Z[c][((image_np) * (l)) + (p)])

// WARNING : both arguments are evaluated twice, do not use expressions with
// side effects.
#define max(x,y) ((x) > (y) ? (x) : (y))
#define min(x,y) ((x) < (y) ? (x) : (y))

//////////////////////////////////////////////////////////
//            global variables 
//////////////////////////////////////////////////////////

// global instrumentation counters for the main thread:
// both are computed by main() as the difference between two get_cycle() samples
// (set-up phase, then thread launch / execution phase), truncated to 32 bits.
unsigned int SEQUENCIAL_TIME = 0;
unsigned int PARALLEL_TIME   = 0;

// instrumentation counters, indexed by [cid][lid] (cluster index, local index),
// with cid = tid / ncores and lid = tid % ncores.
// START and H_BEG are written by each thread in execute(), from get_cycle().
// NOTE(review): the other counters are presumably written the same way at the
// beginning / end of the following phases - confirm in the rest of execute().
unsigned int START[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
unsigned int H_BEG[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
unsigned int H_END[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
unsigned int V_BEG[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
unsigned int V_END[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
unsigned int F_BEG[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
unsigned int F_END[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};

// pointer on buffer containing the input image, mapped by the main to the input file
unsigned char *  image_in;

// pointer on buffer containing the output image, mapped by the main to the output file
unsigned char *  image_out;

// return values at thread exit: the address of one of these variables is passed
// to pthread_exit(), and main() dereferences the joined status pointer to compare
// it with THREAD_EXIT_SUCCESS.
unsigned int THREAD_EXIT_SUCCESS = 0;
unsigned int THREAD_EXIT_FAILURE = 1;

// pointer and identifier for FBF windows (both set by fbf_create_window() in main)
void   *  in_win_buf;
int       in_wid;
void   *  out_win_buf;
int       out_wid;

// synchronization barrier: initialised by main() for nthreads threads,
// and used by the working threads in execute()
pthread_barrier_t     barrier;

// platform parameters (copied by main() from the hard_config_t structure)
unsigned int  x_size;              // number of clusters in a row
unsigned int  y_size;              // number of clusters in a column
unsigned int  ncores;              // number of processors per cluster

// main thread continuous index
// (only set in the NO_PLACEMENT and EXPLICIT_PLACEMENT modes)
unsigned int     tid_main;

// arrays of pointers on distributed buffers in all clusters:
// entry [cid] is allocated in execute() by the thread that has lid == 0,
// and read by all threads after the first barrier.
unsigned char  * GA[CLUSTERS_MAX];
int            * GB[CLUSTERS_MAX];
int            * GC[CLUSTERS_MAX];
int            * GD[CLUSTERS_MAX];
unsigned char  * GZ[CLUSTERS_MAX];

// array of threads kernel identifiers / indexed by [tid] 
pthread_t        exec_trdid[THREADS_MAX];

// array of threads attributes / indexed by [tid]
pthread_attr_t   exec_attr[THREADS_MAX]; 

// array of execute() function arguments / indexed by [tid]
pthread_parallel_work_args_t exec_args[THREADS_MAX];

// image features (set by main(), either from the keyboard or from the defaults)
unsigned int   image_nl;                // number of lines
unsigned int   image_np;                // number of pixels per line
char           input_image_path[128];
char           output_image_path[128];

/////////////////////////////////////////////////////////////////////////////////////
//           functions declaration
/////////////////////////////////////////////////////////////////////////////////////

// function executed by each working thread; <args> is used by execute() as a
// pointer on a pthread_parallel_work_args_t structure containing the thread tid.
void * execute( void * args );

// called by main() with the open instrumentation file and its name.
// NOTE(review): defined further down in the file - presumably dumps the
// instrumentation counters declared above; confirm.
void instrument( FILE * f , char * filename );

///////////////////////////////////////////////////////////////////////////////////////
// The main() function runs the sequential part of the application :
// - it checks the placement mode and the platform parameters,
// - it gets the image size and pathnames (keyboard or default values),
// - it checks the image against the FBF size and type, and against the number
//   of threads,
// - it opens the instrumentation file and creates & activates the two FBF windows,
// - it initialises the barrier, and maps the input & output files to the
//   <image_in> & <image_out> buffers,
// - it launches the working threads, following the selected placement mode,
// - it calls instrument(), closes the three files, and deletes the FBF windows.
// Any detected error is reported on the terminal, and the process calls exit( 0 ).
// This function never returns : it always terminates with exit().
///////////////////////////////////////////////////////////////////////////////////////
void main( void )
{
    unsigned long long start_cycle;
    unsigned long long end_sequencial_cycle;
    unsigned long long end_parallel_cycle;

    int          error;

    char         instru_name[32];               // instrumentation file name
    char         instru_path[64];               // instrumentation path name

    /////////////////////////////////////////////////////////////////////////////////
    get_cycle( &start_cycle );
    /////////////////////////////////////////////////////////////////////////////////

    // exactly one placement mode must be selected
    if( (NO_PLACEMENT + EXPLICIT_PLACEMENT + PARALLEL_PLACEMENT) != 1 )
    {
        printf("\n[convol error] illegal placement\n");
        exit( 0 );
    }

    // get & check platform parameters
    hard_config_t  config;
    get_config( &config );
    x_size = config.x_size;
    y_size = config.y_size;
    ncores = config.ncores;

    if((ncores != 1) && (ncores != 2) && (ncores != 4))
    {
        printf("\n[convol error] number of cores per cluster must be 1/2/4\n");
        exit( 0 );
    }

    if( (x_size != 1) && (x_size != 2) && (x_size != 4) && 
        (x_size != 8) && (x_size != 16) )
    {
        printf("\n[convol error] x_size must be 1/2/4/8/16\n");
        exit( 0 );
    }
        
    if( (y_size != 1) && (y_size != 2) && (y_size != 4) && 
        (y_size != 8) && (y_size != 16) )
    {
        printf("\n[convol error] y_size must be 1/2/4/8/16\n");
        exit( 0 );
    }
        
    // main thread get identifiers for core executing main
    unsigned int  cxy_main;
    unsigned int  lid_main;
    get_core_id( &cxy_main , &lid_main );

    // compute nthreads and nclusters
    unsigned int nclusters = x_size * y_size;
    unsigned int nthreads  = nclusters * ncores;

    // get input and output images pathnames and size
    if( INTERACTIVE_MODE )
    {
        // get image size
        printf("\n[convol] image nlines      : ");
        get_uint32( &image_nl );

        printf("\n[convol] image npixels     : ");
        get_uint32( &image_np );

        printf("\n[convol] input image path  : ");
        get_string( input_image_path , 128 );

        printf("[convol] output image path : ");
        get_string( output_image_path , 128 );
    }
    else
    {
        image_nl = NL;
        image_np = NP;
        strcpy( input_image_path  , INPUT_IMAGE_PATH );
        strcpy( output_image_path , OUTPUT_IMAGE_PATH );
    }

    // main thread get FBF size and type
    int   fbf_width;
    int   fbf_height;
    int   fbf_type;
    fbf_get_config( &fbf_width , &fbf_height , &fbf_type );

    // NOTE(review): the output window is created below with p_zero == image_np,
    // so displaying both windows seems to require fbf_width >= 2 * image_np,
    // when this test only requires image_np. If the FBF is too narrow, the error
    // is presumably reported by fbf_create_window() - confirm.
    if( ((unsigned int)fbf_width  < image_np) || 
        ((unsigned int)fbf_height < image_nl) || 
        (fbf_type != IMAGE_TYPE) )
    {
        printf("\n[convol error] image not acceptable\n"
               "FBF width  = %d / npixels  = %d\n"
               "FBF height = %d / nlines   = %d\n"
               "FBF type   = %d / expected = %d\n",
               fbf_width, image_np, fbf_height, image_nl, fbf_type, IMAGE_TYPE );
        exit( 0 );
    }

    // each thread must handle at least one line for the horizontal filter
    if( nthreads > image_nl )
    {
        printf("\n[convol error] nthreads (%d) larger than nlines (%d)\n",
        nthreads , image_nl );
        exit( 0 );
    }

    // each thread must handle at least one column for the vertical filter :
    // execute() divides by (image_np / nclusters), that must not be zero
    if( nthreads > image_np )
    {
        printf("\n[convol error] nthreads (%d) larger than npixels (%d)\n",
        nthreads , image_np );
        exit( 0 );
    }

    // define instrumentation file name : <barrier type>_<placement>_<nclusters>_<ncores>
    if( NO_PLACEMENT )
    {
        printf("\n[convol] %d cluster(s) / %d core(s) / FBF[%d*%d] / PID %x / NO_PLACE\n",
        nclusters, ncores, fbf_width, fbf_height, getpid() );

        // build instrumentation file name
        if( USE_DQT_BARRIER )
        snprintf( instru_name , 32 , "dqt_no_place_%d_%d", x_size * y_size , ncores );
        else
        snprintf( instru_name , 32 , "smp_no_place_%d_%d", x_size * y_size , ncores );
    }

    if( EXPLICIT_PLACEMENT )
    {
        printf("\n[convol] %d cluster(s) / %d core(s) / FBF[%d*%d] / PID %x / EXPLICIT\n",
        nclusters, ncores, fbf_width, fbf_height, getpid() );

        // build instrumentation file name
        if( USE_DQT_BARRIER )
        snprintf( instru_name , 32 , "dqt_explicit_%d_%d", x_size * y_size , ncores );
        else
        snprintf( instru_name , 32 , "smp_explicit_%d_%d", x_size * y_size , ncores );
    }

    if( PARALLEL_PLACEMENT )
    {
        printf("\n[convol] %d cluster(s) / %d core(s) / FBF[%d*%d] / PID %x / PARALLEL\n",
        nclusters, ncores, fbf_width, fbf_height, getpid() );

        // build instrumentation file name
        if( USE_DQT_BARRIER )
        snprintf( instru_name , 32 , "dqt_parallel_%d_%d", x_size * y_size , ncores );
        else
        snprintf( instru_name , 32 , "smp_parallel_%d_%d", x_size * y_size , ncores );
    }

    // open instrumentation file
    // NOTE(review): the <mode> argument is NULL - presumably ignored by the
    // ALMOS-MKH fopen() implementation; confirm.
    snprintf( instru_path , 64 , "/home/convol/%s", instru_name );
    FILE * f_instru = fopen( instru_path , NULL );
    if ( f_instru == NULL ) 
    { 
        printf("\n[convol error] cannot open instrumentation file %s\n", instru_path );
        exit( 0 );
    }

#if  VERBOSE_MAIN
printf("\n[convol] main on core[%x,%d] open instrumentation file %s\n",
cxy_main, lid_main, instru_path );
#endif

    // main create an FBF window for input image
    in_wid = fbf_create_window( 0,                   // l_zero
                                0,                   // p_zero
                                image_nl,            // lines
                                image_np,            // pixels
                                &in_win_buf );
    if( in_wid < 0 ) 
    {
        printf("\n[convol error] cannot open FBF window for %s\n",
        input_image_path);
        exit( 0 );
    }

    // activate window
    error = fbf_active_window( in_wid , 1 );

    if( error )
    {
        printf("\n[convol error] cannot activate window for %s\n",
        input_image_path );
        exit( 0 );
    }

#if  VERBOSE_MAIN
printf("\n[convol] main on core[%x,%d] created FBF window (wid %d) for <%s>\n",
cxy_main, lid_main, in_wid, input_image_path );
#endif

    // main create an FBF window for output image,
    // placed on the right side of the input window
    out_wid = fbf_create_window( 0,                   // l_zero
                                 image_np,            // p_zero
                                 image_nl,            // lines
                                 image_np,            // pixels
                                 &out_win_buf );
    if( out_wid < 0 ) 
    {
        printf("\n[convol error] cannot create FBF window for %s\n",
        output_image_path);
        exit( 0 );
    }

    // activate window
    error = fbf_active_window( out_wid , 1 );

    if( error )
    {
        printf("\n[convol error] cannot activate window for %s\n",
        output_image_path );
        exit( 0 );
    }

#if  VERBOSE_MAIN
printf("\n[convol] main on core[%x,%d] created FBF window (wid %d) for <%s>\n",
cxy_main, lid_main, out_wid, output_image_path );
#endif

    // main initialise barrier 
    if( USE_DQT_BARRIER )
    {
        // the attributes describe the clusters mesh,
        // and the number of threads per cluster
        pthread_barrierattr_t attr;
        attr.x_size   = x_size;
        attr.y_size   = y_size;
        attr.nthreads = ncores;
        error = pthread_barrier_init( &barrier, &attr , nthreads );
    }
    else
    {
        error = pthread_barrier_init( &barrier, NULL , nthreads );
    }

    if( error )
    {
        printf("\n[convol error] cannot initialize barrier\n");
        exit( 0 );
    }

#if VERBOSE_MAIN
printf("\n[convol] main on core[%x,%d] completed barrier init\n", 
cxy_main, lid_main );
#endif

    // main open input file
    int fd_in = open( input_image_path , O_RDONLY , 0 );

    if ( fd_in < 0 ) 
    { 
        printf("\n[convol error] cannot open input file <%s>\n", input_image_path );
        exit( 0 );
    }

    // main thread map input file to image_in buffer 
    // NOTE(review): the failure is detected by a NULL pointer, and not by the POSIX
    // MAP_FAILED value - presumably the ALMOS-MKH mmap() convention; confirm.
    image_in = (unsigned char *)mmap( NULL,
                                      image_np * image_nl,
                                      PROT_READ,
                                      MAP_FILE | MAP_SHARED,
                                      fd_in,
                                      0 );           // offset
    if ( image_in == NULL ) 
    { 
        printf("\n[convol error] main cannot map buffer to file %s\n", input_image_path );
        exit( 0 );
    }

#if  VERBOSE_MAIN
printf("\n[convol] main on core[%x,%x] map <image_in> buffer to file <%s>\n",
cxy_main, lid_main, input_image_path );
#endif

    // main thread open output file
    int fd_out = open( output_image_path , O_CREAT , 0 ); 

    if ( fd_out < 0 ) 
    { 
        printf("\n[convol error] main cannot open file %s\n", output_image_path );
        exit( 0 );
    }

    // main thread map image_out buffer to output file
    image_out = (unsigned char *)mmap( NULL,
                                       image_np * image_nl,
                                       PROT_WRITE,
                                       MAP_FILE | MAP_SHARED,
                                       fd_out,
                                       0 );     // offset 
    if ( image_out == NULL ) 
    { 
        printf("\n[convol error] main cannot map buffer to file %s\n", output_image_path );
        exit( 0 );
    }

#if  VERBOSE_MAIN
printf("\n[convol] main on core[%x,%x] map <image_out> buffer to file <%s>\n",
cxy_main, lid_main, output_image_path );
#endif

    /////////////////////////////////////////////////////////////////////////////////////
    get_cycle( &end_sequencial_cycle );
    SEQUENCIAL_TIME = (unsigned int)(end_sequencial_cycle - start_cycle);
    /////////////////////////////////////////////////////////////////////////////////////

//////////////////
#if NO_PLACEMENT
{
    // the tid value for the main thread is always 0
    // main thread creates other threads with tid in [1,nthreads-1]  
    unsigned int tid;
    for ( tid = 0 ; tid < nthreads ; tid++ )
    {
        // register tid value in exec_args[tid] array
        exec_args[tid].tid = tid;
          
        // create other threads
        if( tid > 0 )
        {
            if ( pthread_create( &exec_trdid[tid], 
                                 NULL,                  // no attribute
                                 &execute,
                                 &exec_args[tid] ) ) 
            {
                printf("\n[convol error] cannot create thread %d\n", tid );
                exit( 0 );
            }

#if VERBOSE_MAIN
printf("\n[convol] main created thread %d\n", tid );
#endif

        }
        else
        {
            tid_main = 0;
        }
    }  // end for tid

    // main thread calls itself the execute() function
    execute( &exec_args[0] );

    // main thread wait other threads completion
    for ( tid = 1 ; tid < nthreads ; tid++ )
    {
        unsigned int * status;

        // main wait thread[tid] status
        if ( pthread_join( exec_trdid[tid], (void*)(&status)) )
        {
            printf("\n[convol error] main cannot join thread %d\n", tid );
            exit( 0 );
        }
       
        // check status
        if( *status != THREAD_EXIT_SUCCESS )
        {
            printf("\n[convol error] thread %x returned failure\n", tid );
            exit( 0 );
        }

#if VERBOSE_MAIN 
printf("\n[convol] main successfully joined thread %x\n", tid );
#endif
        
    }  // end for tid
}  
#endif // end no_placement

//////////////////////
#if EXPLICIT_PLACEMENT
{
    // main thread places each other threads on a specific core[cxy][lid]
    // but the actual thread creation is sequencial
    unsigned int x;
    unsigned int y;
    unsigned int l;
    unsigned int cxy;                   // cluster identifier
    unsigned int tid;                   // thread continuous index

    for( x = 0 ; x < x_size ; x++ )
    {
        for( y = 0 ; y < y_size ; y++ )
        {
            cxy = HAL_CXY_FROM_XY( x , y );
            for( l = 0 ; l < ncores ; l++ )
            {
                // compute thread continuous index
                tid = (((x  * y_size) + y) * ncores) + l;

                // register tid value in exec_args[tid] array
                exec_args[tid].tid = tid;

                // no thread created on the core running the main
                if( (cxy != cxy_main) || (l != lid_main) )
                {
                    // define thread attributes
                    exec_attr[tid].attributes = PT_ATTR_CLUSTER_DEFINED |
                                                PT_ATTR_CORE_DEFINED;
                    exec_attr[tid].cxy        = cxy;
                    exec_attr[tid].lid        = l;
 
                    // create thread[tid] on core[cxy][l]
                    if ( pthread_create( &exec_trdid[tid],    
                                         &exec_attr[tid],    
                                         &execute,
                                         &exec_args[tid] ) )       
                    {
                        printf("\n[convol error] cannot create thread %d\n", tid );
                        exit( 0 );
                    }
#if VERBOSE_MAIN 
printf("\n[convol] main created thread[%d] on core[%x,%d]\n", tid, cxy, l );
#endif
                }
                else
                {
                    tid_main = tid;
                }
            }
        }
    }

    // main thread calls itself the execute() function
    execute( &exec_args[tid_main] );

    // main thread wait other threads completion
    for( tid = 0 ; tid < nthreads ; tid++ )
    {
        // no other thread on the core running the main
        if( tid != tid_main )
        {
            unsigned int * status;

            // wait thread[tid]
            if( pthread_join( exec_trdid[tid] , (void*)(&status) ) )
            {
                printf("\n[convol error] main cannot join thread %d\n", tid );
                exit( 0 );
            }
     
            // check status
            if( *status != THREAD_EXIT_SUCCESS )
            {
                printf("\n[convol error] thread %d returned failure\n", tid );
                exit( 0 );
            }
#if VERBOSE_MAIN 
// the <cxy> and <l> variables are stale when the creation loops are completed :
// the core coordinates are recomputed from tid == (((x * y_size) + y) * ncores) + l
x = (tid / ncores) / y_size;
y = (tid / ncores) % y_size;
printf("\n[convol] main joined thread %d on core[%x,%d]\n",
tid , HAL_CXY_FROM_XY( x , y ) , tid % ncores );
#endif
        }
    }
} 
#endif   // end explicit_placement

//////////////////////
#if PARALLEL_PLACEMENT
{
    // compute covering DQT size an level :
    // root_level is log2( max( x_size , y_size ) ), both sizes being in {1,2,4,8,16}
    unsigned int z          = (x_size > y_size) ? x_size : y_size;
    unsigned int root_level = ((z == 1) ? 0 : 
                              ((z == 2) ? 1 : 
                              ((z == 4) ? 2 : 
                              ((z == 8) ? 3 : 4))));

    // create & execute the working threads
    if( pthread_parallel_create( root_level , &execute ) )
    {
        printf("\n[convol error] in %s\n", __FUNCTION__ );
        exit( 0 );
    }
}
#endif  // end parallel_placement

    /////////////////////////////////////////////////////////////////////////////
    get_cycle( &end_parallel_cycle );
    PARALLEL_TIME = (unsigned int)(end_parallel_cycle - end_sequencial_cycle);
    /////////////////////////////////////////////////////////////////////////////

    // main thread register instrumentation results
    instrument( f_instru , instru_name );

#if VERBOSE_MAIN 
printf("\n[convol] main registered instrumentation info\n" );
#endif

    // main thread close input file
    close( fd_in );

#if VERBOSE_MAIN 
printf("\n[convol] main closed input file\n" );
#endif

    // main thread close output file
    close( fd_out );

#if VERBOSE_MAIN 
printf("\n[convol] main closed output file\n" );
#endif

    // main thread close instrumentation file
    fclose( f_instru );

#if VERBOSE_MAIN 
printf("\n[convol] main closed instrumentation file\n" );
#endif

    // ask confirm for exit
    if( INTERACTIVE_MODE )
    {
        char byte;
        printf("\n[convol] press any key to to delete FBF windows and exit\n");
        getc( &byte );
    }
  
    // main thread delete FBF windows
    fbf_delete_window( in_wid );
    fbf_delete_window( out_wid );

#if VERBOSE_MAIN 
printf("\n[convol] main deleted FBF windows\n" );
#endif

    // main thread suicide 
    exit( 0 );
    
} // end main() 


//////////////////////////////////
void * execute( void * arguments )
//////////////////////////////////
{
    unsigned long long date;

    pthread_parallel_work_args_t * args = (pthread_parallel_work_args_t *)arguments;

    // Each thread initialises the convolution kernel parameters in local stack.
    // The values defined in the next 12 lines are Philips proprietary information.

    int   vnorm  = 115;
    int   vf[35] = { 1, 1, 2, 2, 2,
                     2, 3, 3, 3, 4,
                     4, 4, 4, 5, 5,
                     5, 5, 5, 5, 5,
                     5, 5, 4, 4, 4,
                     4, 3, 3, 3, 2,
                     2, 2, 2, 1, 1 };

    unsigned int hrange = 100;
    unsigned int hnorm  = 201;

    // WARNING 
    //A thread is identified by the tid index, defined in the "args" structure.
    // This index being in range [0,nclusters*ncores-1] we can always write
    //       tid == cid * ncores + lid 
    // with cid in [0,nclusters-1] and lid in [0,ncores-1].
    // if NO_PLACEMENT, there is no relation between these
    // thread [cid][lid] indexes, and the core coordinates [cxy][lpid]

    // get thread abstract identifiers[cid,lid]  from tid
    unsigned int tid = args->tid;
    unsigned int cid = tid / ncores;   
    unsigned int lid = tid % ncores;

#if VERBOSE_EXEC
unsigned int cxy;              // core cluster identifier
unsigned int lpid;             // core local identifier
get_cycle( &date );
get_core_id( &cxy , &lpid );
printf("\n[convol] exec[%d] on core[%x,%d] enters parallel exec / cycle %d\n",
tid , cxy , lpid , (unsigned int)date );
#endif

    // compute nthreads and nclusters from global variables
    unsigned int nclusters = x_size * y_size;
    unsigned int nthreads  = nclusters * ncores;

    // indexes for loops
    unsigned int c;                 // cluster index 
    unsigned int l;                 // line index 
    unsigned int p;                 // pixel index 
    unsigned int z;                 // vertical filter index 

    unsigned int lines_per_thread   = image_nl / nthreads;
    unsigned int lines_per_cluster  = image_nl / nclusters;
    unsigned int pixels_per_thread  = image_np / nthreads;
    unsigned int pixels_per_cluster = image_np / nclusters;

    // compute number of pixels stored in one cluster 
    unsigned int local_pixels = image_nl * image_np / nclusters;        

    get_cycle( &date );
    START[cid][lid] = (unsigned int)date;

    // Each thread[cid][0] allocates 5 buffers local cluster cid 
    // and registers these 5 pointers in the global arrays
    if ( lid == 0 )
    {
        GA[cid] = malloc( local_pixels * sizeof( unsigned char ) );
        GB[cid] = malloc( local_pixels * sizeof( int ) );
        GC[cid] = malloc( local_pixels * sizeof( int ) );
        GD[cid] = malloc( local_pixels * sizeof( int ) );
        GZ[cid] = malloc( local_pixels * sizeof( unsigned char ) );

        if( (GA[cid] == NULL) || 
            (GB[cid] == NULL) || 
            (GC[cid] == NULL) || 
            (GD[cid] == NULL) || 
            (GZ[cid] == NULL) )
        {
            printf("\n[convol error] thread[%d] cannot allocate buf_in\n", tid );
            pthread_exit( &THREAD_EXIT_FAILURE );
        }

#if VERBOSE_EXEC
get_cycle( &date );
printf("\n[convol] exec[%d] on core[%x,%d] allocated shared buffers / cycle %d\n"
" GA %x / GB %x / GC %x / GD %x / GZ %x\n",
tid, cxy , lpid, (unsigned int)date, GA[cid], GB[cid], GC[cid], GD[cid], GZ[cid] );
#endif
    
    }

    ////////////////////////////////
    pthread_barrier_wait( &barrier );

    // Each thread[tid] allocates and initialises in its private stack 
    // a copy of the arrays of pointers on the distributed buffers.
    unsigned char  * A[CLUSTERS_MAX];
    int            * B[CLUSTERS_MAX];
    int            * C[CLUSTERS_MAX];
    int            * D[CLUSTERS_MAX];
    unsigned char  * Z[CLUSTERS_MAX];

    for( c = 0 ; c < nclusters ; c++ )
    {
        A[c] = GA[c];
        B[c] = GB[c];
        C[c] = GC[c];
        D[c] = GD[c];
        Z[c] = GZ[c];
    }

    unsigned int npixels  = image_np * lines_per_thread;     // pixels moved by any thread
    unsigned int g_offset = npixels * tid;             // offset in global buffer for tid
    unsigned int l_offset = npixels * lid;             // offset in local buffer for tid

    // min and max line indexes handled by thread[tid] for a global buffer
    unsigned int global_lmin = tid * lines_per_thread;   
    unsigned int global_lmax = global_lmin + lines_per_thread;  

    // min and max line indexes handled by thread[tid] for a local buffer
    unsigned int local_lmin  = lid * lines_per_thread;   
    unsigned int local_lmax  = local_lmin + lines_per_thread;  

    // pmin and pmax pixel indexes handled by thread[tid] in a column
    unsigned int column_pmin = tid * pixels_per_thread;  
    unsigned int column_pmax = column_pmin + pixels_per_thread; 

    // Each thread[tid] copy npixels from image_in buffer to local A[cid] buffer
    memcpy( A[cid]   + l_offset,
            image_in + g_offset,
            npixels );
 
#if VERBOSE_EXEC
get_cycle( &date );
printf( "\n[convol] exec[%d] on core[%x,%d] loaded input file in A[%d] / cycle %d\n", 
tid , cxy , lpid , cid , (unsigned int)date);
#endif

    // Optionnal parallel display for the initial image 
    if ( INITIAL_DISPLAY_ENABLE )
    {
        // each thread[tid] copy npixels from A[cid] to in_win_buf buffer
        memcpy( in_win_buf + g_offset,
                A[cid]     + l_offset,
                npixels );

        // refresh the FBF window 
        if( fbf_refresh_window( in_wid , global_lmin , global_lmax ) ) 
        {
            printf("\n[convol error] in %s : thread[%d] cannot access FBF\n",
            __FUNCTION__ , tid );
            pthread_exit( &THREAD_EXIT_FAILURE );
        }

#if VERBOSE_EXEC 
get_cycle( &date );
printf( "\n[convol] exec[%d] on core[%x,%d] completed initial display / cycle %d\n",
tid , cxy , lpid , (unsigned int)date );
#endif

        ////////////////////////////////
        pthread_barrier_wait( &barrier );
    }

    ////////////////////////////////////////////////////////////
    // parallel horizontal filter : 
    // B <= Transpose(FH(A))
    // D <= A - FH(A)
    // Each thread computes (image_nl/nthreads) lines.
    // The image must be extended :
    // if (z<0)    TA(cid,l,z) == TA(cid,l,0)
    // if (z>image_np-1) TA(cid,l,z) == TA(cid,l,image_np-1)
    ////////////////////////////////////////////////////////////

    get_cycle( &date );
    H_BEG[cid][lid] = (unsigned int)date;

    // l = global line index / p = absolute pixel index  

    for (l = global_lmin; l < global_lmax; l++)
    {
        // src_c and src_l are the cluster index and the line index for A & D
        int src_c = l / lines_per_cluster;
        int src_l = l % lines_per_cluster;

        // We use the specific values of the horizontal ep-filter for optimisation:
        // sum(p) = sum(p-1) + TA[p+hrange] - TA[p-hrange-1]
        // To minimize the number of tests, the loop on pixels is split in three domains 

        int sum_p = (hrange + 2) * TA(src_c, src_l, 0);
        for (z = 1; z < hrange; z++)
        {
            sum_p = sum_p + TA(src_c, src_l, z);
        }

        // first domain : from 0 to hrange
        for (p = 0; p < hrange + 1; p++)
        {
            // dst_c and dst_p are the cluster index and the pixel index for B
            int dst_c = p / pixels_per_cluster;
            int dst_p = p % pixels_per_cluster;
            sum_p = sum_p + (int) TA(src_c, src_l, p + hrange) - (int) TA(src_c, src_l, 0);
            TB(dst_c, dst_p, l) = sum_p / hnorm;
            TD(src_c, src_l, p) = (int) TA(src_c, src_l, p) - sum_p / hnorm;
        }
        // second domain : from (hrange+1) to (image_np-hrange-1)
        for (p = hrange + 1; p < image_np - hrange; p++)
        {
            // dst_c and dst_p are the cluster index and the pixel index for B
            int dst_c = p / pixels_per_cluster;
            int dst_p = p % pixels_per_cluster;
            sum_p = sum_p + (int) TA(src_c, src_l, p + hrange) 
                          - (int) TA(src_c, src_l, p - hrange - 1);
            TB(dst_c, dst_p, l) = sum_p / hnorm;
            TD(src_c, src_l, p) = (int) TA(src_c, src_l, p) - sum_p / hnorm;
        }
        // third domain : from (image_np-hrange) to (image_np-1)
        for (p = image_np - hrange; p < image_np; p++)
        {
            // dst_c and dst_p are the cluster index and the pixel index for B
            int dst_c = p / pixels_per_cluster;
            int dst_p = p % pixels_per_cluster;
            sum_p = sum_p + (int) TA(src_c, src_l, image_np - 1) 
                          - (int) TA(src_c, src_l, p - hrange - 1);
            TB(dst_c, dst_p, l) = sum_p / hnorm;
            TD(src_c, src_l, p) = (int) TA(src_c, src_l, p) - sum_p / hnorm;
        }

#if SUPER_VERBOSE
get_cycle( &date );
printf(" - line %d computed at cycle %d\n", l, (unsigned int)date );
#endif    

    }

    get_cycle( &date );
    H_END[cid][lid] = (unsigned int)date;

#if VERBOSE_EXEC 
get_cycle( &date );
printf( "\n[convol] exec[%d] on core[%x,%d] completed horizontal filter / cycle %d\n",
tid , cxy , lpid , (unsigned int)date );
#endif

    ////////////////////////////////
    pthread_barrier_wait( &barrier );

    ///////////////////////////////////////////////////////////////
    // parallel vertical filter : 
    // C <= Transpose(FV(B))
    // Each thread computes (image_np/nthreads) columns
    // The image must be extended :
    // if (l<0)    TB(cid,p,l) == TB(cid,p,0)
    // if (l>image_nl-1)   TB(cid,p,l) == TB(cid,p,image_nl-1)
    ///////////////////////////////////////////////////////////////

    get_cycle( &date );
    V_BEG[cid][lid] = (unsigned int)date;

    // l = global line index / p = pixel index in column

    for (p = column_pmin; p < column_pmax ; p++)
    {
        // src_c and src_p are the cluster index and the pixel index for B
        int src_c = p / pixels_per_cluster;
        int src_p = p % pixels_per_cluster;

        int sum_l;

        // We use the specific values of the vertical ep-filter
        // To minimize the number of tests, the image_nl lines are split in three domains 

        // first domain : explicit computation for the first 18 values
        for (l = 0; l < 18; l++)
        {
            // dst_c and dst_l are the cluster index and the line index for C
            int dst_c = l / lines_per_cluster;
            int dst_l = l % lines_per_cluster;

            for (z = 0, sum_l = 0; z < 35; z++)
            {
                sum_l = sum_l + vf[z] * TB(src_c, src_p, max(l - 17 + z,0) );
            }
            TC(dst_c, dst_l, p) = sum_l / vnorm;
        }
        // second domain
        for (l = 18; l < image_nl - 17; l++)
        {
            // dst_c and dst_l are the cluster index and the line index for C
            int dst_c = l / lines_per_cluster;
            int dst_l = l % lines_per_cluster;

            sum_l = sum_l + TB(src_c, src_p, l + 4)
                  + TB(src_c, src_p, l + 8)
                  + TB(src_c, src_p, l + 11)
                  + TB(src_c, src_p, l + 15)
                  + TB(src_c, src_p, l + 17)
                  - TB(src_c, src_p, l - 5)
                  - TB(src_c, src_p, l - 9)
                  - TB(src_c, src_p, l - 12)
                  - TB(src_c, src_p, l - 16)
                  - TB(src_c, src_p, l - 18);

            TC(dst_c, dst_l, p) = sum_l / vnorm;
        }
        // third domain
        for (l = image_nl - 17; l < image_nl; l++)
        {
            // dst_c and dst_l are the cluster index and the line index for C
            int dst_c = l / lines_per_cluster;
            int dst_l = l % lines_per_cluster;

            sum_l = sum_l + TB(src_c, src_p, min(l + 4, image_nl - 1))
                  + TB(src_c, src_p, min(l + 8, image_nl - 1))
                  + TB(src_c, src_p, min(l + 11, image_nl - 1))
                  + TB(src_c, src_p, min(l + 15, image_nl - 1))
                  + TB(src_c, src_p, min(l + 17, image_nl - 1))
                  - TB(src_c, src_p, l - 5)
                  - TB(src_c, src_p, l - 9)
                  - TB(src_c, src_p, l - 12)
                  - TB(src_c, src_p, l - 16)
                  - TB(src_c, src_p, l - 18);

            TC(dst_c, dst_l, p) = sum_l / vnorm;
        }

#if SUPER_VERBOSE
get_cycle( &date );
printf(" - column %d computed at cycle %d\n", p, (unsigned int)date );
#endif 

    }

    get_cycle( &date );
    V_END[cid][lid] = (unsigned int)date;

#if VERBOSE_EXEC 
get_cycle( &date );
printf( "\n[convol] exec[%d] on core[%x,%d] completed vertical filter / cycle %d\n",
tid , cxy , lid , (unsigned int)date );
#endif

    ////////////////////////////////
    pthread_barrier_wait( &barrier );

    ///////////////////////////////////////////////////////////////
    // build final image in local Z buffer from C & D local buffers
    // store it in output image file, and display it on FBF.
    // Z <= C + D
    ///////////////////////////////////////////////////////////////

    get_cycle( &date );
    F_BEG[cid][lid] = (unsigned int)date;

    // Each thread[tid] set local buffer Z[cid] from local buffers C[cid] & D[cid]

    for( l = local_lmin ; l < local_lmax ; l++ )
    {
        for( p = 0 ; p < image_np ; p++ )
        {
            TZ(cid,l,p) = TC(cid,l,p) + TD(cid,l,p);
        }
    }

    // Each thread[tid] copy npixels from Z[cid] buffer to image_out buffer
    memcpy( image_out + g_offset,
            Z[cid]    + l_offset,
            npixels );

    // Optional parallel display of the final image 
    if ( FINAL_DISPLAY_ENABLE )
    {
        // each thread[tid] copy npixels from Z[cid] to out_win_buf buffer
        memcpy( out_win_buf + g_offset,
                Z[cid]      + l_offset,
                npixels );

        // refresh the FBF window 
        if( fbf_refresh_window( out_wid , global_lmin , global_lmax ) )
        {
            printf("\n[convol error] in %s : thread[%d] cannot access FBF\n",
            __FUNCTION__ , tid );
            pthread_exit( &THREAD_EXIT_FAILURE );
        }

#if VERBOSE_EXEC 
get_cycle( &date );
printf( "\n[convol] exec[%d] on core[%x,%d] completed final display / cycle %d\n",
tid , cxy , lpid , (unsigned int)date );
#endif

    }

    // Each thread[cid,0] releases the 5 local buffers
    if( lid == 0 )
    {
        free( A[cid] );
        free( B[cid] );
        free( C[cid] );
        free( D[cid] );
        free( Z[cid] );
    }

    get_cycle( &date );
    F_END[cid][lid] = (unsigned int)date;

    // thread termination depends on the placement policy
    if( PARALLEL_PLACEMENT )   
    {
        // <exec> threads are running in detached mode, and
        // each thread must signal completion by waiting on the barrier
        // passed in arguments before exiting

        pthread_barrier_wait( args->barrier );

        pthread_exit( &THREAD_EXIT_SUCCESS );
    }
    else
    {
        // <exec> threads are running in attached mode
        // all threads (but the one executing main) exit
        if ( tid != tid_main ) pthread_exit( &THREAD_EXIT_SUCCESS );
    }

    return NULL;

} // end execute()


///////////////////////////////////////////////////////////////////////////////////////
// Helper for instrument() : widen the [*min , *max] range so it contains <value>.
///////////////////////////////////////////////////////////////////////////////////////
static void instrument_update( unsigned int   value,
                               unsigned int * min,
                               unsigned int * max )
{
    if (value < *min) *min = value;
    if (value > *max) *max = value;
}

///////////////////////////////////////////////////////////////////////////////////////
// Helper for instrument() : middle of the [min , max] range.
// Written as min + (max - min)/2 because (min + max)/2 wraps around when the
// sum does not fit in an unsigned int.
///////////////////////////////////////////////////////////////////////////////////////
static unsigned int instrument_med( unsigned int min,
                                    unsigned int max )
{
    return min + (max - min) / 2;
}

///////////////////////////////////////////////////////////////////////////////////////
// This function summarizes the instrumentation counters and reports them
// on the terminal, then in the <f> stream.
// For each one of the START, H_BEG, H_END, V_BEG, V_END, F_BEG, F_END arrays,
// it scans the (x_size * y_size) clusters and the <ncores> cores per cluster
// to get the min and max values, and prints min / max / med / delta.
// It then prints the duration of each phase, divided by 1000 (Kcycles),
// and the SEQUENCIAL_TIME / PARALLEL_TIME global values.
// - f        : stream receiving the disk copy of the report (skipped if NULL).
// - filename : name printed in the report header.
//
// NOTE(review): the F_BEG / F_END counters are printed with the "D_BEG" / "D_END"
// labels, and the last phase is called "DISPLAY" on the terminal but "SAVE" on
// disk. These labels are kept unchanged here - confirm whether this is intended.
///////////////////////////////////////////////////////////////////////////////////////
void instrument( FILE * f,
                 char * filename )
{
    unsigned int nclusters = x_size * y_size;

    unsigned int cc, pp;

    // each min starts at the largest 32 bits value, each max starts at zero
    unsigned int min_start = 0xFFFFFFFF;
    unsigned int max_start = 0;

    unsigned int min_h_beg = 0xFFFFFFFF;
    unsigned int max_h_beg = 0;

    unsigned int min_h_end = 0xFFFFFFFF;
    unsigned int max_h_end = 0;

    unsigned int min_v_beg = 0xFFFFFFFF;
    unsigned int max_v_beg = 0;

    unsigned int min_v_end = 0xFFFFFFFF;
    unsigned int max_v_end = 0;

    unsigned int min_f_beg = 0xFFFFFFFF;
    unsigned int max_f_beg = 0;

    unsigned int min_f_end = 0xFFFFFFFF;
    unsigned int max_f_end = 0;

    // scan all counters, indexed by [cluster][core]
    for (cc = 0; cc < nclusters; cc++)
    {
        for (pp = 0; pp < ncores; pp++ )
        {
            instrument_update( START[cc][pp] , &min_start , &max_start );
            instrument_update( H_BEG[cc][pp] , &min_h_beg , &max_h_beg );
            instrument_update( H_END[cc][pp] , &min_h_end , &max_h_end );
            instrument_update( V_BEG[cc][pp] , &min_v_beg , &max_v_beg );
            instrument_update( V_END[cc][pp] , &min_v_end , &max_v_end );
            instrument_update( F_BEG[cc][pp] , &min_f_beg , &max_f_beg );
            instrument_update( F_END[cc][pp] , &min_f_end , &max_f_end );
        }
    }

    // phase durations in Kcycles, computed once for both reports
    unsigned int load_kc    = (min_h_beg - min_start) / 1000;
    unsigned int h_filt_kc  = (max_h_end - min_h_beg) / 1000;
    unsigned int bar_hv_kc  = (min_v_beg - max_h_end) / 1000;
    unsigned int v_filt_kc  = (max_v_end - min_v_beg) / 1000;
    unsigned int bar_vf_kc  = (min_f_beg - max_v_end) / 1000;
    unsigned int final_kc   = (max_f_end - min_f_beg) / 1000;

    // the two global times are declared outside this function : the explicit
    // cast guarantees that the argument type matches the %u conversion
    unsigned int seq_kc     = (unsigned int)(SEQUENCIAL_TIME / 1000);
    unsigned int par_kc     = (unsigned int)(PARALLEL_TIME / 1000);

    // display on terminal
    // all values are unsigned : they must be printed with %u (not %d),
    // otherwise values above 0x7FFFFFFF are displayed as negative numbers
    printf( "\n ------ %s ------\n" , filename );

    printf(" - START : min = %u / max = %u / med = %u / delta = %u\n",
           min_start, max_start,
           instrument_med( min_start , max_start ), max_start-min_start);

    printf(" - H_BEG : min = %u / max = %u / med = %u / delta = %u\n",
           min_h_beg, max_h_beg,
           instrument_med( min_h_beg , max_h_beg ), max_h_beg-min_h_beg);

    printf(" - H_END : min = %u / max = %u / med = %u / delta = %u\n",
           min_h_end, max_h_end,
           instrument_med( min_h_end , max_h_end ), max_h_end-min_h_end);

    printf(" - V_BEG : min = %u / max = %u / med = %u / delta = %u\n",
           min_v_beg, max_v_beg,
           instrument_med( min_v_beg , max_v_beg ), max_v_beg-min_v_beg);

    printf(" - V_END : min = %u / max = %u / med = %u / delta = %u\n",
           min_v_end, max_v_end,
           instrument_med( min_v_end , max_v_end ), max_v_end-min_v_end);

    printf(" - D_BEG : min = %u / max = %u / med = %u / delta = %u\n",
           min_f_beg, max_f_beg,
           instrument_med( min_f_beg , max_f_beg ), max_f_beg-min_f_beg);

    printf(" - D_END : min = %u / max = %u / med = %u / delta = %u\n",
           min_f_end, max_f_end,
           instrument_med( min_f_end , max_f_end ), max_f_end-min_f_end);

    printf( "\n General Scenario   (Kcycles)\n" );
    printf( " - LOAD IMAGE        = %u\n", load_kc );
    printf( " - H_FILTER          = %u\n", h_filt_kc );
    printf( " - BARRIER HORI/VERT = %u\n", bar_hv_kc );
    printf( " - V_FILTER          = %u\n", v_filt_kc );
    printf( " - BARRIER VERT/DISP = %u\n", bar_vf_kc );
    printf( " - DISPLAY           = %u\n", final_kc );
    printf( " \nSEQUENCIAL = %u / PARALLEL = %u\n", seq_kc, par_kc );

    // nothing to save when no instrumentation stream is provided
    if( f == NULL ) return;

    // save on disk
    fprintf( f ,  "\n ------ %s ------\n" , filename );

    fprintf( f , " - START : min = %u / max = %u / med = %u / delta = %u\n",
           min_start, max_start,
           instrument_med( min_start , max_start ), max_start-min_start);

    fprintf( f , " - H_BEG : min = %u / max = %u / med = %u / delta = %u\n",
           min_h_beg, max_h_beg,
           instrument_med( min_h_beg , max_h_beg ), max_h_beg-min_h_beg);

    fprintf( f , " - H_END : min = %u / max = %u / med = %u / delta = %u\n",
           min_h_end, max_h_end,
           instrument_med( min_h_end , max_h_end ), max_h_end-min_h_end);

    fprintf( f , " - V_BEG : min = %u / max = %u / med = %u / delta = %u\n",
           min_v_beg, max_v_beg,
           instrument_med( min_v_beg , max_v_beg ), max_v_beg-min_v_beg);

    fprintf( f , " - V_END : min = %u / max = %u / med = %u / delta = %u\n",
           min_v_end, max_v_end,
           instrument_med( min_v_end , max_v_end ), max_v_end-min_v_end);

    fprintf( f , " - D_BEG : min = %u / max = %u / med = %u / delta = %u\n",
           min_f_beg, max_f_beg,
           instrument_med( min_f_beg , max_f_beg ), max_f_beg-min_f_beg);

    fprintf( f , " - D_END : min = %u / max = %u / med = %u / delta = %u\n",
           min_f_end, max_f_end,
           instrument_med( min_f_end , max_f_end ), max_f_end-min_f_end);

    fprintf( f ,  "\n General Scenario (Kcycles)\n" );
    fprintf( f ,  " - LOAD IMAGE        = %u\n", load_kc );
    fprintf( f ,  " - H_FILTER          = %u\n", h_filt_kc );
    fprintf( f ,  " - BARRIER HORI/VERT = %u\n", bar_hv_kc );
    fprintf( f ,  " - V_FILTER          = %u\n", v_filt_kc );
    fprintf( f ,  " - BARRIER VERT/DISP = %u\n", bar_vf_kc );
    fprintf( f ,  " - SAVE              = %u\n", final_kc );
    fprintf( f ,  " \nSEQUENCIAL = %u / PARALLEL = %u\n", seq_kc, par_kc );

} // end instrument()


// Local Variables:
// tab-width: 3
// c-basic-offset: 3
// c-file-offsets:((innamespace . 0)(inline-open . 0))
// indent-tabs-mode: nil
// End:

// vim: filetype=cpp:expandtab:shiftwidth=3:tabstop=3:softtabstop=3