Changeset 652 for trunk/user/convol


Ignore:
Timestamp:
Nov 14, 2019, 3:56:51 PM (4 years ago)
Author:
alain
Message:

Introduce the three placement modes in "transpose", "convol", "fft" applications.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/user/convol/convol.c

    r645 r652  
    55///////////////////////////////////////////////////////////////////////////////////////
    66// This multi-threaded application implements a 2D convolution product. 
    7 // It can run on a multi-processors, multi-clusters architecture, with one thread
    8 // per processor, and uses the POSIX threads API.
     7// It can run on a multi-cores, multi-clusters architecture, with one thread
     8// per core, and uses the POSIX threads API.
    99//
    1010// The main() function can be launched on any processor P[x,y,l].
     
    1414// when the parallel execution is completed.
    1515//
    16 // The convolution kernel is [201]*[35] pixels, but it can be factored in two
    17 // independant line and column convolution products.
     16// The convolution kernel is defined in the execute() function.
     17// It can be factored in two independent line and column convolution products.
    1818// The five buffers containing the image are distributed in clusters.
     19// For the philips image, the kernel is a [201]*[35] pixels rectangle.
    1920//
    2021// The (1024 * 1024) pixels image is read from a file (2 bytes per pixel).
    2122//
    2223// - number of clusters containing processors must be power of 2 no larger than 256.
    23 // - number of processors per cluster must be power of 2 no larger than 8.
     24// - number of processors per cluster must be power of 2 no larger than 4.
     25//
     26// The number N of working threads is always defined by the number of cores available
     27// in the architecture, but this application supports three placement modes.
     28// In all modes, the working threads are identified by the [tid] continuous index
     29// in range [0, NTHREADS-1], which defines how the lines are shared amongst the threads.
     30// This continuous index can always be decomposed in two continuous sub-indexes:
     31// tid == cid * ncores + lid,  where cid is in [0,NCLUSTERS-1] and lid in [0,NCORES-1].
     32//
     33// - NO_PLACEMENT: the main thread is itself a working thread. The (N-1) other working
     34//   threads are created by the main thread, but the placement is done by the OS, using
     35//   the DQDT for load balancing, and two working threads can be placed on the same core.
     36//   The [cid,lid] are only abstract identifiers, and cannot be associated to a physical
     37//   cluster or a physical core. In this mode, the main thread runs on any cluster,
     38//   but has tid = 0 (i.e. cid = 0 & lid = 0).
     39//
     40// - EXPLICIT_PLACEMENT: the main thread is again a working thread, but the placement
     41//   of the threads on the cores is explicitly controlled by the main thread to have
     42//   exactly one working thread per core, and the [cxy][lpid] core coordinates for a given
     43//   thread[tid] can be directly derived from the [tid] value: [cid] is an alias for the
     44//   physical cluster identifier, and [lid] is the local core index.
     45//
     46// - PARALLEL_PLACEMENT: the main thread is not anymore a working thread, and uses the
     47//   non standard pthread_parallel_create() function to avoid the costly sequential
     48//   loops for pthread_create() and pthread_join(). It guarantees one working thread
     49//   per core, and the same relation between the thread[tid] and the core[cxy][lpid].
     50//
     51// The [tid] continuous index defines how the work is shared amongst the threads:
     52// - each thread handles NL/nthreads lines for the horizontal filter.
     53// - each thread handles NP/nthreads columns for the vertical filter.
    2454///////////////////////////////////////////////////////////////////////////////////////
    2555
     56#include <sys/mman.h>
    2657#include <stdio.h>
    2758#include <stdlib.h>
     
    2960#include <unistd.h>
    3061#include <pthread.h>
     62#include <string.h>
    3163#include <almosmkh.h>
    3264#include <hal_macros.h>
    3365
    34 #define IMAGE_IN_PATH              "misc/philips_1024.raw"
    35 
    36 #define USE_SQT_BARRIER            1
    37 #define VERBOSE                    1
    38 #define SUPER_VERBOSE              0
    39 
    40 #define USE_DQT_BARRIER            1
     66#define VERBOSE_MAIN               1
     67#define VERBOSE_EXEC               1
    4168
    4269#define X_MAX                      16
    4370#define Y_MAX                      16
    44 #define PROCS_MAX                  4
     71#define CORES_MAX                  4
    4572#define CLUSTERS_MAX               (X_MAX * Y_MAX)
    46 #define THREADS_MAX                (X_MAX * Y_MAX * PROCS_MAX]
    47 
    48 #define INITIAL_DISPLAY_ENABLE     1
    49 #define FINAL_DISPLAY_ENABLE       1
    50 
    51 #define PIXEL_SIZE                 2       // input image has 2 bytes per pixel
    52 #define FBF_TYPE                   420     // output image has 1 byte per pixel
    53 
     73#define THREADS_MAX                (X_MAX * Y_MAX * CORES_MAX)
     74
     75#define IMAGE_IN_PATH              "misc/philips_1024_2.raw"
     76#define IMAGE_IN_PIXEL_SIZE        2                               // 2 bytes per pixel
     77
     78#define IMAGE_OUT_PATH             "misc/philips_after_1O24.raw"
     79#define IMAGE_OUT_PIXEL_SIZE       1                               // 1 bytes per pixel
     80
     81#define FBF_TYPE                   420
    5482#define NL                         1024
    5583#define NP                         1024
    5684#define NB_PIXELS                  (NP * NL)
    57 #define FRAME_SIZE                 (NB_PIXELS * PIXEL_SIZE)
    58 
     85
     86#define NO_PLACEMENT               0
     87#define EXPLICIT_PLACEMENT         0
     88#define PARALLEL_PLACEMENT         1
     89
     90#define USE_DQT_BARRIER            1
     91#define INITIAL_DISPLAY_ENABLE     1
     92#define FINAL_DISPLAY_ENABLE       1
    5993
    6094#define TA(c,l,p)  (A[c][((NP) * (l)) + (p)])
     
    68102
    69103//////////////////////////////////////////////////////////
    70 //   global variables stored in seg_data in cluster[0,0]
     104//            global variables
    71105//////////////////////////////////////////////////////////
    72106
    73 // Instrumentation counters (cluster_id, lpid]
    74 unsigned int START[CLUSTERS_MAX][PROCS_MAX];
    75 unsigned int H_BEG[CLUSTERS_MAX][PROCS_MAX];
    76 unsigned int H_END[CLUSTERS_MAX][PROCS_MAX];
    77 unsigned int V_BEG[CLUSTERS_MAX][PROCS_MAX];
    78 unsigned int V_END[CLUSTERS_MAX][PROCS_MAX];
    79 unsigned int D_BEG[CLUSTERS_MAX][PROCS_MAX];
    80 unsigned int D_END[CLUSTERS_MAX][PROCS_MAX];
    81 
    82 // file pointers on input image
    83 FILE * f_image_in;
    84 FILE * f_instrum;
     107// global instrumentation counters for the main thread
     108unsigned int SEQUENCIAL_TIME = 0;
     109unsigned int PARALLEL_TIME   = 0;
     110
     111// instrumentation counters for thread[tid] in cluster[cid]
     112unsigned int START[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
     113unsigned int H_BEG[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
     114unsigned int H_END[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
     115unsigned int V_BEG[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
     116unsigned int V_END[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
     117unsigned int D_BEG[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
     118unsigned int D_END[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
     119
     120// pointer on buffer containing the input image, mapped by the main to the input file
     121unsigned char *  image_in;
     122
     123// pointer on buffer containing the output image, mapped by the main to the output file
     124unsigned char *  image_out;
    85125
    86126// return values at thread exit
     
    91131pthread_barrier_t     barrier;
    92132
    93 // coordinates of core executing the main thread
    94 unsigned int cxy_main;
    95 unsigned int lid_main;
     133// platform parameters
     134unsigned int  x_size;              // number of clusters in a row
     135unsigned int  y_size;              // number of clusters in a column
     136unsigned int  ncores;              // number of processors per cluster
    96137
    97138// arrays of pointers on distributed buffers in all clusters
    98139unsigned short * GA[CLUSTERS_MAX];
    99 int *            GB[CLUSTERS_MAX];
    100 int *            GC[CLUSTERS_MAX];
    101 int *            GD[CLUSTERS_MAX];
    102 unsigned char *  GZ[CLUSTERS_MAX];
    103 
    104 // trdid[] array for execution threads
    105 // 1D array if no explicit threads placement / 2D array if explicit placement
    106 pthread_t        trdid[CLUSTERS_MAX][PROCS_MAX];
    107 //pthread_t        trdid[THREADS_MAX];
    108 
    109 // attr[] array for execution threads
    110 // unused if no explicit threads placement
    111 pthread_attr_t   attr[CLUSTERS_MAX][PROCS_MAX];
     140int            * GB[CLUSTERS_MAX];
     141int            * GC[CLUSTERS_MAX];
     142int            * GD[CLUSTERS_MAX];
     143unsigned char  * GZ[CLUSTERS_MAX];
     144
     145// array of threads kernel identifiers / indexed by [tid]
     146pthread_t        exec_trdid[THREADS_MAX];
     147
     148// array of threads attributes / indexed by [tid]
     149pthread_attr_t   exec_attr[THREADS_MAX];
     150
     151// array of execute() function arguments / indexed by [tid]
     152pthread_parallel_work_args_t exec_args[THREADS_MAX];
     153
     154// main thread continuous index
     155unsigned int     tid_main;
    112156
    113157/////////////////////////////////////////////////////////////////////////////////////
     
    115159/////////////////////////////////////////////////////////////////////////////////////
    116160
    117 void execute( void );
    118 
    119 void instrument( unsigned int nclusters,
    120                  unsigned int ncores );
     161void execute( pthread_parallel_work_args_t * args );
     162
     163void instrument( FILE * f , char * filename );
    121164
    122165/////////////////
    123166void main( void )
    124167{
    125     unsigned int x_size;                 // number of clusters in a row
    126     unsigned int y_size;                 // number of clusters in a column
    127     unsigned int ncores;                 // number of processors per cluster
    128 
    129     unsigned long long  date;
    130 
    131     char         name[64];               // instrumentation file name
    132     char         path[128];              // instrumentation path name
     168    unsigned long long start_cycle;
     169    unsigned long long end_sequencial_cycle;
     170    unsigned long long end_parallel_cycle;
    133171
    134172    int          error;
    135173
    136     // get platform parameters
    137     if ( get_config( &x_size , &y_size , &ncores ) )
    138     {
    139         printf("\n[convol error] cannot get hardware configuration\n");
     174    char         instru_name[32];               // instrumentation file name
     175    char         instru_path[64];              // instrumentation path name
     176
     177    /////////////////////////////////////////////////////////////////////////////////
     178    get_cycle( &start_cycle );
     179    /////////////////////////////////////////////////////////////////////////////////
     180
     181    if( (NO_PLACEMENT + EXPLICIT_PLACEMENT + PARALLEL_PLACEMENT) != 1 )
     182    {
     183        printf("\n[convol error] illegal placement\n");
    140184        exit( 0 );
    141185    }
    142186
    143     // get core executing this main thread
    144     // and register these coordinates in global variables
    145     get_core_id( &cxy_main , &lid_main );
    146    
    147     // check ncores
    148     if( (ncores != 1) && (ncores != 2) && (ncores != 4) )
     187    // get & check platform parameters
     188    get_config( &x_size , &y_size , &ncores );
     189
     190    if((ncores != 1) && (ncores != 2) && (ncores != 4))
    149191    {
    150192        printf("\n[convol error] number of cores per cluster must be 1/2/4\n");
     
    152194    }
    153195
    154     // check x_size
    155     if( (x_size != 1) && (x_size != 2) && (x_size != 4) && (x_size != 8) && (x_size != 16) )
     196    if( (x_size != 1) && (x_size != 2) && (x_size != 4) &&
     197        (x_size != 8) && (x_size != 16) )
    156198    {
    157199        printf("\n[convol error] x_size must be 1/2/4/8/16\n");
    158200        exit( 0 );
    159201    }
    160 
    161     // check y_size
    162     if( (y_size != 1) && (y_size != 2) && (y_size != 4) && (y_size != 8) && (y_size != 16) )
     202       
     203    if( (y_size != 1) && (y_size != 2) && (y_size != 4) &&
     204        (y_size != 8) && (y_size != 16) )
    163205    {
    164206        printf("\n[convol error] y_size must be 1/2/4/8/16\n");
    165207        exit( 0 );
    166208    }
     209       
     210    // main thread get identifiers for core executing main
     211    unsigned int  cxy_main;
     212    unsigned int  lid_main;
     213    get_core_id( &cxy_main , &lid_main );
    167214
    168215    // compute nthreads and nclusters
    169     unsigned int nthreads  = x_size * y_size * ncores;
    170216    unsigned int nclusters = x_size * y_size;
    171 
    172     get_cycle( &date );
    173     printf("\n[convol] starts on core[%x,%d] / %d thread(s) / cycle %d\n",
    174     cxy_main, lid_main, nthreads, (unsigned int)date );
    175 
    176     // build instrumentation file name
    177     if( USE_DQT_BARRIER )
    178     snprintf( name , 64 , "p_convol_dqt_%d_%d", x_size * y_size , ncores );
    179     else
    180     snprintf( name , 64 , "p_convol_smp_%d_%d", x_size * y_size , ncores );
    181 
    182     // build pathname
    183     snprintf( path , 128 , "/home/%s", name );
     217    unsigned int nthreads  = nclusters * ncores;
     218
     219    // main thread get FBF size and type
     220    unsigned int   fbf_width;
     221    unsigned int   fbf_height;
     222    unsigned int   fbf_type;
     223    fbf_get_config( &fbf_width , &fbf_height , &fbf_type );
     224
     225    if( (fbf_width != NP) || (fbf_height != NL) || (fbf_type != FBF_TYPE) )
     226    {
     227        printf("\n[convol error] image does not fit FBF size or type\n");
     228        exit( 0 );
     229    }
     230
     231    if( nthreads > NL )
     232    {
     233        printf("\n[convol error] number of threads larger than number of lines\n");
     234        exit( 0 );
     235    }
     236
     237    // define instrumentation file name
     238    if( NO_PLACEMENT )
     239    {
     240        printf("\n[convol] %d cluster(s) / %d core(s) / FBF[%d*%d] / PID %x / NO_PLACE\n",
     241        nclusters, ncores, fbf_width, fbf_height, getpid() );
     242
     243        // build instrumentation file name
     244        if( USE_DQT_BARRIER )
     245        snprintf( instru_name , 32 , "conv_dqt_no_place_%d_%d", x_size * y_size , ncores );
     246        else
     247        snprintf( instru_name , 32 , "conv_smp_no_place_%d_%d", x_size * y_size , ncores );
     248    }
     249
     250    if( EXPLICIT_PLACEMENT )
     251    {
     252        printf("\n[convol] %d cluster(s) / %d core(s) / FBF[%d*%d] / PID %x / EXPLICIT\n",
     253        nclusters, ncores, fbf_width, fbf_height, getpid() );
     254
     255        // build instrumentation file name
     256        if( USE_DQT_BARRIER )
     257        snprintf( instru_name , 32 , "conv_dqt_explicit_%d_%d_%d", x_size * y_size , ncores );
     258        else
     259        snprintf( instru_name , 32 , "conv_smp_explicit_%d_%d_%d", x_size * y_size , ncores );
     260    }
     261
     262    if( PARALLEL_PLACEMENT )
     263    {
     264        printf("\n[convol] %d cluster(s) / %d core(s) / FBF[%d*%d] / PID %x / PARALLEL\n",
     265        nclusters, ncores, fbf_width, fbf_height, getpid() );
     266
     267        // build instrumentation file name
     268        if( USE_DQT_BARRIER )
     269        snprintf( instru_name , 32 , "conv_dqt_parallel_%d_%d_%d", x_size * y_size , ncores );
     270        else
     271        snprintf( instru_name , 32 , "conv_smp_parallel_%d_%d_%d", x_size * y_size , ncores );
     272    }
    184273
    185274    // open instrumentation file
    186     f_instrum = fopen( path , NULL );
    187     if ( f_instrum == NULL )
     275    snprintf( instru_path , 64 , "/home/%s", instru_name );
     276    FILE * f_instru = fopen( instru_path , NULL );
     277    if ( f_instru == NULL )
    188278    {
    189         printf("\n[convol error] cannot open instrumentation file <%s>\n", path );
     279        printf("\n[convol error] cannot open instrumentation file %s\n", instru_path );
    190280        exit( 0 );
    191281    }
    192282
    193 #if DEBUG_MAIN
    194 get_cycle( &date );
    195 printf("\n[convol] main on core[%x,%d] open file <%s> at cycle %d\n",
    196 cxy_main, lid_main, path, (unsigned int)date );
    197 #endif
    198 
    199     // open input file
    200     f_image_in = fopen( IMAGE_IN_PATH , NULL );
    201     if ( f_image_in == NULL )
    202     {
    203         printf("\n[convol error] cannot open input file <%s>\n", IMAGE_IN_PATH );
    204         exit( 0 );
    205     }
    206 
    207 #if DEBUG_MAIN
    208 get_cycle( &date );
    209 printf("\n[convol] main on core[%x,%d] open file <%s> at cycle %d\n",
    210 cxy_main, lid_main, path, (unsigned int)date );
    211 #endif
    212    
    213     // get FBF config
    214     unsigned int  fbf_width;
    215     unsigned int  fbf_height;
    216     unsigned int  fbf_type;
    217     fbf_get_config( &fbf_width , &fbf_height , &fbf_type );
    218 
    219     // check FBF size
    220     if ( (fbf_width != NP) || (fbf_height != NL) )
    221     {
    222         printf("\n[convol error] bad FBF size\n");
    223         exit( 0 );
    224     }
    225 
    226     // check FBF subsampling
    227     if ( fbf_type != FBF_TYPE )
    228     {
    229         printf("\n[convol error] bad FBF subsampling\n");
    230         exit( 0 );
    231     }
    232 
    233     // initialise barrier
     283#if  VERBOSE_MAIN
     284printf("\n[convol] main on core[%x,%d] open instrumentation file %s\n",
     285cxy_main, lid_main, instru_path );
     286#endif
     287
     288    // main initialise barrier
    234289    if( USE_DQT_BARRIER )
    235290    {
     
    251306    }
    252307
    253     get_cycle( &date );
    254     printf("\n[convol] main on core[%x,%d] completes initialisation at cycle %d\n"
    255            "- CLUSTERS     = %d\n"
    256            "- PROCS        = %d\n"
    257            "- THREADS      = %d\n",
    258            cxy_main, lid_main, (unsigned int)date, nclusters, ncores, nthreads );
    259 
    260     // launch exec threads with explicit placement
    261     unsigned int x;
    262     unsigned int y;
    263     unsigned int l;
    264     unsigned int cxy;
    265  
    266     for( x = 0 ; x < x_size ; x++ )
    267     {
    268         for( y = 0 ; y < y_size ; y++ )
    269         {
    270            cxy = HAL_CXY_FROM_XY(x,y);
    271            for( l = 0 ; l < ncores ; l++ )
    272            {
    273                // no other thread on the core running the main
    274                if( (cxy != cxy_main) || (l != lid_main) )
    275                {
    276                    // define thread attributes
    277                    attr[cxy][l].attributes = PT_ATTR_CLUSTER_DEFINED | PT_ATTR_CORE_DEFINED;
    278                    attr[cxy][l].cxy        = cxy;
    279                    attr[cxy][l].lid        = l;
     308#if VERBOSE_MAIN
     309printf("\n[convol] main on core[%x,%d] completes barrier init\n",
     310cxy_main, lid_main );
     311#endif
     312
     313    // main open input file
     314    int fd_in = open( IMAGE_IN_PATH , O_RDONLY , 0 );
     315
     316    if ( fd_in < 0 )
     317    {
     318        printf("\n[convol error] cannot open input file <%s>\n", IMAGE_IN_PATH );
     319        exit( 0 );
     320    }
     321
     322#if VERBOSE_MAIN
     323printf("\n[convol] main on core[%x,%d] open file <%s>\n",
     324cxy_main, lid_main, IMAGE_IN_PATH );
     325#endif
     326   
     327    // main thread map image_in buffer to input file
     328    image_in = (unsigned char *)mmap( NULL,
     329                                      NB_PIXELS * IMAGE_IN_PIXEL_SIZE,
     330                                      PROT_READ,
     331                                      MAP_FILE | MAP_SHARED,
     332                                      fd_in,
     333                                      0 );           // offset
     334    if ( image_in == NULL )
     335    {
     336        printf("\n[convol error] main cannot map buffer to file %s\n", IMAGE_IN_PATH );
     337        exit( 0 );
     338    }
     339
     340#if  VERBOSE_MAIN
     341printf("\n[convol] main on core[%x,%x] map buffer to file <%s>\n",
     342cxy_main, lid_main, IMAGE_IN_PATH );
     343#endif
     344
     345    // main thread open output file
     346    int fd_out = open( IMAGE_OUT_PATH , O_CREAT , 0 );
     347
     348    if ( fd_out < 0 )
     349    {
     350        printf("\n[convol error] main cannot open file %s\n", IMAGE_OUT_PATH );
     351        exit( 0 );
     352    }
     353
     354#if  VERBOSE_MAIN
     355printf("\n[convol] main on core[%x,%d] open file <%s>\n",
     356cxy_main, lid_main, IMAGE_OUT_PATH );
     357#endif
     358
     359    // main thread map image_out buffer to output file
     360    image_out = (unsigned char *)mmap( NULL,
     361                                       NB_PIXELS + IMAGE_OUT_PIXEL_SIZE,
     362                                       PROT_WRITE,
     363                                       MAP_FILE | MAP_SHARED,
     364                                       fd_out,
     365                                       0 );     // offset
     366    if ( image_out == NULL )
     367    {
     368        printf("\n[convol error] main cannot map buffer to file %s\n", IMAGE_OUT_PATH );
     369        exit( 0 );
     370    }
     371
     372#if  VERBOSE_MAIN
     373printf("\n[convol] main on core[%x,%x] map buffer to file <%s>\n",
     374cxy_main, lid_main, IMAGE_OUT_PATH );
     375#endif
     376
     377    /////////////////////////////////////////////////////////////////////////////////////
     378    get_cycle( &end_sequencial_cycle );
     379    SEQUENCIAL_TIME = (unsigned int)(end_sequencial_cycle - start_cycle);
     380    /////////////////////////////////////////////////////////////////////////////////////
     381
     382    //////////////////
     383    if( NO_PLACEMENT )
     384    {
     385        // the tid value for the main thread is always 0
     386        // main thread creates new threads with tid in [1,nthreads-1] 
     387        unsigned int tid;
     388        for ( tid = 0 ; tid < nthreads ; tid++ )
     389        {
     390            // register tid value in exec_args[tid] array
     391            exec_args[tid].tid = tid;
     392           
     393            // create other threads
     394            if( tid > 0 )
     395            {
     396                if ( pthread_create( &exec_trdid[tid],
     397                                     NULL,                  // no attribute
     398                                     &execute,
     399                                     &exec_args[tid] ) )
     400                {
     401                    printf("\n[convol error] cannot create thread %d\n", tid );
     402                    exit( 0 );
     403                }
     404
     405#if VERBOSE_MAIN
     406printf("\n[convol] main created thread %d\n", tid );
     407#endif
     408
     409            }
     410            else
     411            {
     412                tid_main = 0;
     413            }
     414        }  // end for tid
     415
     416        // main thread calls itself the execute() function
     417        execute( &exec_args[0] );
     418
     419        // main thread wait other threads completion
     420        for ( tid = 1 ; tid < nthreads ; tid++ )
     421        {
     422            unsigned int * status;
     423
     424            // main wait thread[tid] status
     425            if ( pthread_join( exec_trdid[tid], (void*)(&status)) )
     426            {
     427                printf("\n[convol error] main cannot join thread %d\n", tid );
     428                exit( 0 );
     429            }
     430       
     431            // check status
     432            if( *status != THREAD_EXIT_SUCCESS )
     433            {
     434                printf("\n[convol error] thread %x returned failure\n", tid );
     435                exit( 0 );
     436            }
     437
     438#if VERBOSE_MAIN
     439printf("\n[convol] main successfully joined thread %x\n", tid );
     440#endif
     441       
     442        }  // end for tid
     443
     444    }  // end if no_placement
     445
     446    ////////////////////////
     447    if( EXPLICIT_PLACEMENT )
     448    {
     449        // main thread places each other threads on a specific core[cxy][lid]
     450        // but the actual thread creation is sequencial
     451        unsigned int x;
     452        unsigned int y;
     453        unsigned int l;
     454        unsigned int cxy;                   // cluster identifier
     455        unsigned int tid;                   // thread continuous index
     456
     457        for( x = 0 ; x < x_size ; x++ )
     458        {
     459            for( y = 0 ; y < y_size ; y++ )
     460            {
     461                cxy = HAL_CXY_FROM_XY( x , y );
     462                for( l = 0 ; l < ncores ; l++ )
     463                {
     464                    // compute thread continuous index
     465                    tid = (((x  * y_size) + y) * ncores) + l;
     466
     467                    // register tid value in exec_args[tid] array
     468                    exec_args[tid].tid = tid;
     469
     470                    // no thread created on the core running the main
     471                    if( (cxy != cxy_main) || (l != lid_main) )
     472                    {
     473                        // define thread attributes
     474                        exec_attr[tid].attributes = PT_ATTR_CLUSTER_DEFINED |
     475                                                    PT_ATTR_CORE_DEFINED;
     476                        exec_attr[tid].cxy        = cxy;
     477                        exec_attr[tid].lid        = l;
    280478 
    281                    // create thread on core[x,y,l]
    282                    if (pthread_create( &trdid[cxy][l],
    283                                        &attr[cxy][l],   
    284                                        &execute,
    285                                        NULL ) )     // execute has no argument
    286                    {
    287                        printf("\n[convol error] created thread %x on core[%x][%d]\n",
    288                        trdid[cxy][l] , cxy , l );
    289                        exit( 0 );
    290                    }
    291                 }
    292             }
    293         }
    294     }   
    295 
    296 /*
    297     // launch other threads without explicit placement
    298     for ( n = 1 ; n < nthreads ; n++ )
    299     {
    300         if ( giet_pthread_create( &trdid[n],
    301                                   NULL,                  // no attribute
    302                                   &execute,
    303                                   NULL ) )               // no argument
    304         {
    305             printf("\n[convol error] creating thread %x\n", trdid[n] );
    306             exit( 0 );
    307         }
    308     }
    309 */
    310 
    311     // the main thread run itself the execute() function
    312     execute();
    313 
    314     // wait other threads completions if explicit threads placement
    315     for( x = 0 ; x < x_size ; x++ )
    316     {
    317         for( y = 0 ; y < y_size ; y++ )
    318         {
    319             unsigned int cxy = HAL_CXY_FROM_XY(x,y);
    320             for( l = 0 ; l < ncores ; l++ )
    321             {
    322                 // no other thread on the core running the main
    323                 if( (cxy != cxy_main) || (l != lid_main) )
    324                 {
    325                     unsigned int * exit_status;
    326 
    327                     // wait thread running on core[x,y,l]
    328                     if (pthread_join( trdid[cxy][l] , (void*)(&exit_status) ) )
     479                        // create thread[tid] on core[cxy][l]
     480                        if ( pthread_create( &exec_trdid[tid],   
     481                                             &exec_attr[tid],   
     482                                             &execute,
     483                                             &exec_args[tid] ) )       
     484                        {
     485                            printf("\n[convol error] cannot create thread %d\n", tid );
     486                            exit( 0 );
     487                        }
     488#if VERBOSE_MAIN
     489printf("\n[convol] main created thread[%d] on core[%x,%d]\n", tid, cxy, l );
     490#endif
     491                    }
     492                    else
    329493                    {
    330                         printf("\n[convol error] main cannot join thread[%x,%d]\n", cxy, l );
    331                         exit( 0 );
    332                     }
    333 
    334                     // check exit_status
    335                     if( *exit_status != 0 )
    336                     {
    337                         printf("\n[convol error] thread[%x,%d]return failure\n", cxy, l );
    338                         exit( 0 );
     494                        tid_main = tid;
    339495                    }
    340496                }
    341497            }
    342498        }
    343     }
    344 /*   
    345     // wait other threads completion when no explicit threads placement
    346     for ( n = 1 ; n < nthreads ; n++ )
    347     {
    348         if ( pthread_join( trdid[n], NULL ) )
    349         {
    350             printf("\n[convol error] joining thread %x\n", trdid[n] );
     499
     500        // main thread calls itself the execute() function
     501        execute( &exec_args[tid_main] );
     502
     503        // main thread wait other threads completion
     504        for( tid = 0 ; tid < nthreads ; tid++ )
     505        {
     506            // no other thread on the core running the main
     507            if( tid != tid_main )
     508            {
     509                unsigned int * status;
     510
     511                // wait thread[tid]
     512                if( pthread_join( exec_trdid[tid] , (void*)(&status) ) )
     513                {
     514                    printf("\n[convol error] main cannot join thread %d\n", tid );
     515                    exit( 0 );
     516                }
     517       
     518                // check status
     519                if( *status != THREAD_EXIT_SUCCESS )
     520                {
     521                    printf("\n[convol error] thread %d returned failure\n", tid );
     522                    exit( 0 );
     523                }
     524#if VERBOSE_MAIN
     525printf("\n[convol] main joined thread %d on core[%x,%d]\n", tid , cxy , l );
     526#endif
     527            }
     528        }
     529    }  // end if explicit_placement
     530
     531    ////////////////////////
     532    if( PARALLEL_PLACEMENT )
     533    {
     534        // compute covering DQT size and level
     535        unsigned int z          = (x_size > y_size) ? x_size : y_size;
     536        unsigned int root_level = ((z == 1) ? 0 :
     537                                  ((z == 2) ? 1 :
     538                                  ((z == 4) ? 2 :
     539                                  ((z == 8) ? 3 : 4))));
     540
     541        // create & execute the working threads
     542        if( pthread_parallel_create( root_level , &execute ) )
     543        {
     544            printf("\n[convol error] in %s\n", __FUNCTION__ );
    351545            exit( 0 );
    352546        }
    353     }
    354 */
    355     // call the instrument() function
    356     instrument( nclusters , ncores );
    357 
     547    }  // end if parallel_placement
     548
     549    /////////////////////////////////////////////////////////////////////////////
     550    get_cycle( &end_parallel_cycle );
     551    PARALLEL_TIME = (unsigned int)(end_parallel_cycle - end_sequencial_cycle);
     552    /////////////////////////////////////////////////////////////////////////////
     553
     554    // main thread register instrumentation results
     555    instrument( f_instru , instru_name );
     556
     557    // main thread close input file
     558    close( fd_in );
     559
     560    // main thread close output file
     561    close( fd_out );
     562
     563    // main thread close instrumentation file
     564    fclose( f_instru );
     565
     566    // main thread suicide
    358567    exit( 0 );
    359568   
     
    362571
    363572
    364 //////////////
    365 void execute()
     573
     574
     575
     576///////////////////////////////////////////////////
     577void execute( pthread_parallel_work_args_t * args )
    366578{
    367579    unsigned long long date;
    368580
    369     // Each thread[x,y,p] initialises the convolution kernel parameters in local stack.
     581    // Each thread initialises the convolution kernel parameters in local stack.
    370582    // The values defined in the next 12 lines are Philips proprietary information.
    371583
     
    382594    unsigned int hnorm  = 201;
    383595
    384     // get plat-form config
    385     unsigned int x_size;            // number of clusters in a row
    386     unsigned int y_size;            // number of clusters in a column
    387     unsigned int ncores;            // number of processors per cluster
    388     get_config( &x_size , &y_size , &ncores );
    389 
    390     // get cluster indentifier and core local index
    391     unsigned int cxy;
    392     unsigned int lid;
    393     get_core_id( &cxy , &lid );
    394     unsigned int x = HAL_X_FROM_CXY( cxy );
    395     unsigned int y = HAL_Y_FROM_CXY( cxy );
     596    // WARNING
     597    // A thread is identified by the tid index, defined in the "args" structure.
     598    // As this index is in range [0,nclusters*ncores-1], we can always write
     599    //       tid == cid * ncores + lid
     600    // with cid in [0,nclusters-1] and lid in [0,ncores-1].
     601    // If NO_PLACEMENT, there is no relation between these abstract
     602    // thread indexes [cid][lid] and the core coordinates [cxy][lpid].
     603
     604    // get thread abstract identifiers
     605    unsigned int tid = args->tid;
     606    unsigned int cid = tid / ncores;   
     607    unsigned int lid = tid % ncores;
     608
     609#if VERBOSE_EXEC
     610unsigned int cxy;              // core cluster identifier
     611unsigned int lpid;             // core local identifier
     612get_core_id( &cxy , &lpid );
     613printf("\n[convol] exec[%d] on core[%x,%d] enters parallel exec\n",
     614tid , cxy , lpid );
     615#endif
     616
     617    // build total number of threads and clusters from global variables
     618    unsigned int nclusters = x_size * y_size;
     619    unsigned int nthreads  = nclusters * ncores;
    396620
    397621    // indexes for loops
     
    401625    unsigned int z;                 // vertical filter index
    402626
    403     unsigned int nclusters  = x_size * y_size;              // number of clusters
    404     unsigned int cluster_id = (x * y_size) + y;             // continuous cluster index
    405     unsigned int thread_id  = (cluster_id * ncores) + lid;  // continuous thread index
    406     unsigned int nthreads   = nclusters * ncores;           // number of threads
    407     unsigned int frame_size = FRAME_SIZE;                   // total size (bytes)
    408     unsigned int lines_per_thread   = NL / nthreads;        // lines per thread
    409     unsigned int lines_per_cluster  = NL / nclusters;       // lines per cluster
    410     unsigned int pixels_per_thread  = NP / nthreads;        // columns per thread
    411     unsigned int pixels_per_cluster = NP / nclusters;       // columns per cluster
     627    unsigned int lines_per_thread   = NL / nthreads;
     628    unsigned int lines_per_cluster  = NL / nclusters;
     629    unsigned int pixels_per_thread  = NP / nthreads;
     630    unsigned int pixels_per_cluster = NP / nclusters;
     631
     632    // compute number of pixels stored in one abstract cluster cid
     633    unsigned int local_pixels = NL * NP / nclusters;       
    412634
    413635    unsigned int first, last;
    414636
    415637    get_cycle( &date );
    416     START[cluster_id][lid] = (unsigned int)date;
    417 
    418     // Each thread[cxy][0] allocate the global buffers in cluster cxy
     638    START[cid][lid] = (unsigned int)date;
     639
     640    // Each thread[cid][0] allocates 5 local buffers,
     641    // shared by all threads that have the same cid
    419642    if ( lid == 0 )
    420643    {
    421 
    422 #if VERBOSE
    423 printf( "\n[convol] thread[%x,%d] enters malloc at cycle %d\n",
    424 cxy , lid , (unsigned int)date );
    425 #endif
    426 
    427         GA[cluster_id] = remote_malloc( (FRAME_SIZE/nclusters)   , cxy );
    428         GB[cluster_id] = remote_malloc( (FRAME_SIZE/nclusters)*2 , cxy );
    429         GC[cluster_id] = remote_malloc( (FRAME_SIZE/nclusters)*2 , cxy );
    430         GD[cluster_id] = remote_malloc( (FRAME_SIZE/nclusters)*2 , cxy );
    431         GZ[cluster_id] = remote_malloc( (FRAME_SIZE/nclusters)/2 , cxy );
    432        
    433 #if VERBOSE
    434 printf( "\n[convol]  Shared Buffer Virtual Addresses in cluster %x\n"
    435         "### GA = %x\n"
    436         "### GB = %x\n"               
    437         "### GC = %x\n"               
    438         "### GD = %x\n"               
    439         "### GZ = %x\n",
    440         cxy,
    441         GA[cluster_id],
    442         GB[cluster_id],
    443         GC[cluster_id],
    444         GD[cluster_id],
    445         GZ[cluster_id] );
     644        GA[cid] = malloc( local_pixels * sizeof( unsigned short ) );
     645        GB[cid] = malloc( local_pixels * sizeof( int ) );
     646        GC[cid] = malloc( local_pixels * sizeof( int ) );
     647        GD[cid] = malloc( local_pixels * sizeof( int ) );
     648        GZ[cid] = malloc( local_pixels * sizeof( unsigned char ) );
     649
     650        if( (GA[cid] == NULL) || (GB[cid] == NULL) || (GC[cid] == NULL) ||
     651            (GD[cid] == NULL) || (GZ[cid] == NULL) )
     652        {
     653            printf("\n[convol error] thread[%d] cannot allocate buf_in\n", tid );
     654            pthread_exit( &THREAD_EXIT_FAILURE );
     655        }
     656
     657#if VERBOSE_EXEC
     658printf( "\n[convol] exec[%d] on core[%x,%d] allocated shared buffers\n"
     659"### GA = %x\n"
     660"### GB = %x\n"               
     661"### GC = %x\n"               
     662"### GD = %x\n"               
     663"### GZ = %x\n",
     664tid, cxy , lpid, GA[cid], GB[cid], GC[cid], GD[cid], GZ[cid] );
    446665#endif
    447666   
     
    451670    pthread_barrier_wait( &barrier );
    452671
    453     // Each thread[cxy,p] initialise in its private stack a copy of the
    454     // arrays of pointers on the shared, distributed buffers.
     672    // Each thread[cid,lid] allocates and initialises, in its private stack,
     673    // a copy of the arrays of pointers to the distributed buffers.
    455674    unsigned short * A[CLUSTERS_MAX];
    456675    int            * B[CLUSTERS_MAX];
     
    468687    }
    469688
    470     // Each thread[x,y,0] access the file containing the input image, to load
    471     // the local A[c] buffer (frame_size / nclusters loaded in each cluster).
    472     // Other threads are waiting on the barrier.
     689    // Each thread[cid,0] copies its slice of the input image (image_in)
     690    // to the local A[cid] buffer. Other threads are waiting on the barrier.
    473691    if ( lid==0 )
    474692    {
    475         unsigned int offset = (frame_size/nclusters)*cluster_id;
    476         unsigned int size   = frame_size/nclusters;
    477 
    478         // seek the pointer in file
    479         if ( fseek( f_image_in,
    480                     offset,
    481                     SEEK_SET ) )
    482         {
    483             printf("\n[convol error] in %s : thread[%x,%d] cannot seek input file\n",
    484             __FUNCTION__ , cxy , lid );
    485             pthread_exit( &THREAD_EXIT_FAILURE );
    486         }
    487 
    488         if ( fread( A[cluster_id],
    489                     1,
    490                     size,
    491                     f_image_in ) != size )
    492         {
    493             printf("\n[convol error] in %s : thread[%x,%d] cannot read input file\n",
    494             __FUNCTION__ , cxy , lid );
    495             pthread_exit( &THREAD_EXIT_FAILURE );
    496         }
     693        unsigned int size   = local_pixels * sizeof( unsigned short );
     694        unsigned int offset = size * cid;
     695
     696        memcpy( A[cid],
     697                image_in + offset,
     698                size );
    497699 
    498 #if VERBOSE
     700#if VERBOSE_EXEC
    499701get_cycle( &date );
    500 printf( "\n[convol] thread[%x,%d] load input file at cycle %d\n",
    501 cxy , lid , (unsigned int)date );
     702printf( "\n[convol] thread %d on core[%x,%d] load input file in A[%d]\n",
     703tid , cxy , lpid , cid );
    502704#endif
    503705
     
    505707
    506708    // Optional parallel display of the initial image stored in A[c] buffers.
    507     // Eah thread[x,y,p] displays (NL/nthreads) lines. (one byte per pixel).
     709    // Each thread[cid,lid] displays (NL/nthreads) lines.
    508710
    509711    if ( INITIAL_DISPLAY_ENABLE )
     
    516718            line = offset + l;
    517719
     720            // copy TA[cid] to TZ[cid]
    518721            for ( p = 0 ; p < NP ; p++ )
    519722            {
    520                 TZ(cluster_id, line, p) = (unsigned char)(TA(cluster_id, line, p) >> 8);
     723                TZ(cid, line, p) = (unsigned char)(TA(cid, line, p) >> 8);
    521724            }
    522725
    523             if (fbf_write( &TZ(cluster_id, line, 0),                  // first pixel in TZ
    524                            NP,                                        // number of bytes
    525                            NP*(l + (thread_id * lines_per_thread))))  // offset in FBF
     726            // display one line to frame buffer
     727            if (fbf_write( &TZ(cid, line, 0),                     // first pixel in TZ
     728                           NP,                                    // number of bytes
     729                           NP*(l + (tid * lines_per_thread))))    // offset in FBF
    526730            {
    527731                printf("\n[convol error] in %s : thread[%x,%d] cannot access FBF\n",
     
    531735        }
    532736
    533 #if VERBOSE
     737#if VERBOSE_EXEC
    534738get_cycle( &date );
    535 printf( "\n[convol] thread[%x,%d] completes initial display at cycle %d\n",
    536 cxy , lid , (unsigned int)date );
     739printf( "\n[convol] thread[%d] on core[%x,%d] completes initial display\n",
     740tid , cxy , lpid );
    537741#endif
    538742
     
    543747    ////////////////////////////////////////////////////////////
    544748    // parallel horizontal filter :
    545     // B <= transpose(FH(A))
     749    // B <= convol(FH(A))
    546750    // D <= A - FH(A)
    547     // Each thread computes (NL/nthreads) lines 
     751    // Each thread computes (NL/nthreads) lines.
    548752    // The image must be extended :
    549     // if (z<0)    TA(cluster_id,l,z) == TA(cluster_id,l,0)
    550     // if (z>NP-1) TA(cluster_id,l,z) == TA(cluster_id,l,NP-1)
     753    // if (z<0)    TA(cid,l,z) == TA(cid,l,0)
     754    // if (z>NP-1) TA(cid,l,z) == TA(cid,l,NP-1)
    551755    ////////////////////////////////////////////////////////////
    552756
    553757    get_cycle( &date );
    554     H_BEG[cluster_id][lid] = (unsigned int)date;
    555 
    556 #if VERBOSE
    557 printf( "\n[convol] thread[%x,%d] starts horizontal filter at cycle %d\n",
    558 cxy , lid , (unsigned int)date );
     758    H_BEG[cid][lid] = (unsigned int)date;
     759
     760#if VERBOSE_EXEC
     761printf( "\n[convol] thread[%d] on core[%x,%d] starts horizontal filter\n",
     762tid , cxy , lpid );
    559763#else
    560 if ( (cxy == cxy_main) && (lid == lid_main) )
    561 printf( "\n[convol] thread[%x,%d] starts horizontal filter at cycle %d\n",
    562 cxy , lid , (unsigned int)date );
     764if ( tid == tid_main )
     765printf( "\n[convol] thread[%d] on core[%x,%d] starts horizontal filter\n",
     766tid , cxy , lpid );
    563767#endif
    564768
     
    566770    // first & last define which lines are handled by a given thread
    567771
    568     first = thread_id * lines_per_thread;
     772    first = tid * lines_per_thread;
    569773    last  = first + lines_per_thread;
    570774
     
    626830
    627831    get_cycle( &date );
    628     H_END[cluster_id][lid] = (unsigned int)date;
    629 
    630 #if VERBOSE
    631 printf( "\n[convol] thread[%x,%d] completes horizontal filter at cycle %d\n",
    632 cxy , lid, (unsigned int)date );
     832    H_END[cid][lid] = (unsigned int)date;
     833
     834#if VERBOSE_EXEC
     835printf( "\n[convol] thread[%d] on core[%x,%d] completes horizontal filter\n",
     836tid , cxy , lpid );
    633837#else
    634 if ( (cxy == cxy_main) && (lid == lid_main) )
    635 printf( "\n[convol] thread[%x,%d] completes horizontal filter at cycle %d\n",
    636 cxy , lid, (unsigned int)date );
     838if ( tid == tid_main )
     839printf( "\n[convol] thread[%d] on core[%x,%d] completes horizontal filter\n",
     840tid , cxy , lpid );
    637841#endif
    638842
     
    645849    // Each thread computes (NP/nthreads) columns
    646850    // The image must be extended :
    647     // if (l<0)    TB(cluster_id,p,l) == TB(cluster_id,p,0)
    648     // if (l>NL-1)   TB(cluster_id,p,l) == TB(cluster_id,p,NL-1)
     851    // if (l<0)    TB(cid,p,l) == TB(cid,p,0)
     852    // if (l>NL-1)   TB(cid,p,l) == TB(cid,p,NL-1)
    649853    ///////////////////////////////////////////////////////////////
    650854
    651855    get_cycle( &date );
    652     V_BEG[cluster_id][lid] = (unsigned int)date;
    653 
    654 #if VERBOSE
    655 printf( "\n[convol] thread[%x,%d] starts vertical filter at cycle %d\n",
    656 cxy , lid , (unsigned int)date );
     856    V_BEG[cid][lid] = (unsigned int)date;
     857
     858#if VERBOSE_EXEC
     859printf( "\n[convol] thread[%d] on core[%x,%d] starts vertical filter\n",
     860tid , cxy , lpid );
    657861#else
    658 if ( (cxy == cxy_main) && (lid == lid_main) )
    659 printf( "\n[convol] thread[%x,%d] starts vertical filter at cycle %d\n",
    660 cxy , lid, (unsigned int)date );
     862if ( tid == tid_main )
     863printf( "\n[convol] thread[%d] on core[%x,%d] starts vertical filter\n",
     864tid , cxy , lpid );
    661865#endif
    662866
     
    664868    // first & last define which pixels are handled by a given thread
    665869
    666     first = thread_id * pixels_per_thread;
     870    first = tid * pixels_per_thread;
    667871    last  = first + pixels_per_thread;
    668872
     
    740944
    741945    get_cycle( &date );
    742     V_END[cluster_id][lid] = (unsigned int)date;
    743 
    744 #if VERBOSE
    745 printf( "\n[convol] thread[%x,%d] completes vertical filter at cycle %d\n",
    746 cxy , lid , (unsigned int)date );
     946    V_END[cid][lid] = (unsigned int)date;
     947
     948#if VERBOSE_EXEC
     949printf( "\n[convol] thread[%d] on core[%x,%d] completes vertical filter\n",
     950tid , cxy , lid );
    747951#else
    748 if ( (cxy == cxy_main) && (lid == lid_main) )
    749 printf( "\n[convol] thread[%x,%d] completes vertical filter at cycle %d\n",
    750 cxy , lid, (unsigned int)date );
     952if ( tid == tid_main )
     953printf( "\n[convol] thread[%d] on core[%x,%d] completes vertical filter\n",
     954tid , cxy , lid );
    751955#endif
    752956
     
    755959
    756960    // Optional parallel display of the final image Z <= D + C
    757     // Eah thread[x,y,p] displays (NL/nthreads) lines. (one byte per pixel).
     961    // Each thread[cid,lid] displays (NL/nthreads) lines.
    758962
    759963    if ( FINAL_DISPLAY_ENABLE )
    760964    {
    761965        get_cycle( &date );
    762         D_BEG[cluster_id][lid] = (unsigned int)date;
    763 
    764 #if VERBOSE
    765 printf( "\n[convol] thread[%x,%d] starts final display at cycle %d\n",
    766 cxy , lid , (unsigned int)date );
     966        D_BEG[cid][lid] = (unsigned int)date;
     967
     968#if VERBOSE_EXEC
     969printf( "\n[convol] thread[%d] on core[%x,%d] starts final display\n",
     970tid , cxy , lid );
    767971#else
    768 if ( (cxy == cxy_main) && (lid == lid_main) )
    769 printf( "\n[convol] thread[%x,%d] starts final display at cycle %d\n",
    770 cxy , lid, (unsigned int)date );
     972if ( tid == tid_main )
     973printf( "\n[convol] thread[%d] on core[%x,%d] starts final display\n",
     974tid , cxy , lid );
    771975#endif
    772976
     
    780984            for ( p = 0 ; p < NP ; p++ )
    781985            {
    782                 TZ(cluster_id, line, p) =
    783                    (unsigned char)( (TD(cluster_id, line, p) +
    784                                      TC(cluster_id, line, p) ) >> 8 );
     986                TZ(cid, line, p) =
     987                   (unsigned char)( (TD(cid, line, p) +
     988                                     TC(cid, line, p) ) >> 8 );
    785989            }
    786990
    787             if (fbf_write( &TZ(cluster_id, line, 0),                  // first pixel in TZ
    788                            NP,                                        // number of bytes
    789                            NP*(l + (thread_id * lines_per_thread))))  // offset in FBF
     991            if (fbf_write( &TZ(cid, line, 0),                   // first pixel in TZ
     992                           NP,                                  // number of bytes
     993                           NP*(l + (tid * lines_per_thread))))  // offset in FBF
    790994            {
    791                 printf("\n[convol error] in %s : thread[%d,%d,%d] cannot access FBF\n",
    792                 __FUNCTION__ , x , y , lid );
     995                printf("\n[convol error] thread[%d] cannot access FBF\n", tid );
    793996                pthread_exit( &THREAD_EXIT_FAILURE );
    794997            }
     
    796999
    7971000        get_cycle( &date );
    798         D_END[cluster_id][lid] = (unsigned int)date;
    799 
    800 #if VERBOSE
    801 printf( "\n[convol] thread[%x,%d] completes final display at cycle %d\n",
    802 cxy , lid , (unsigned int)date );
     1001        D_END[cid][lid] = (unsigned int)date;
     1002
     1003#if VERBOSE_EXEC
     1004printf( "\n[convol] thread[%d] on core[%x,%d] completes final display\n",
     1005tid , cxy , lid );
    8031006#else
    804 if ( (cxy == cxy_main) && (lid == lid_main) )
    805 printf( "\n[convol] thread[%x,%d] completes final display at cycle %d\n",
    806 cxy , lid , (unsigned int)date );
    807 #endif
    808      
    809         ////////////////////////////////
    810         pthread_barrier_wait( &barrier );
     1007if ( tid == tid_main )
     1008printf( "\n[convol] thread[%d] on core[%x,%d] completes final display\n",
     1009tid , cxy , lid );
     1010#endif
     1011
    8111012    }
    8121013
    8131014    // all threads (but the one executing main) exit
    814     if ( (cxy != cxy_main) || (lid != lid_main) )
     1015    if ( tid != tid_main )
    8151016    {
    8161017        pthread_exit( &THREAD_EXIT_SUCCESS );
     
    8211022
    8221023
    823 /////////////////////////////////////////
    824 void instrument( unsigned int nclusters,
    825                  unsigned int ncores )
     1024//////////////////////////
     1025void instrument( FILE * f,
     1026                 char * filename )
    8261027{
    827         unsigned int cc, pp;
    828 
    829         unsigned int min_start = 0xFFFFFFFF;
    830         unsigned int max_start = 0;
    831 
    832         unsigned int min_h_beg = 0xFFFFFFFF;
    833         unsigned int max_h_beg = 0;
    834 
    835         unsigned int min_h_end = 0xFFFFFFFF;
    836         unsigned int max_h_end = 0;
    837 
    838         unsigned int min_v_beg = 0xFFFFFFFF;
    839         unsigned int max_v_beg = 0;
    840 
    841         unsigned int min_v_end = 0xFFFFFFFF;
    842         unsigned int max_v_end = 0;
    843 
    844         unsigned int min_d_beg = 0xFFFFFFFF;
    845         unsigned int max_d_beg = 0;
    846 
    847         unsigned int min_d_end = 0xFFFFFFFF;
    848         unsigned int max_d_end = 0;
    849 
    850         for (cc = 0; cc < nclusters; cc++)
    851         {
    852             for (pp = 0; pp < ncores; pp++ )
    853             {
    854                 if (START[cc][pp] < min_start) min_start = START[cc][pp];
    855                 if (START[cc][pp] > max_start) max_start = START[cc][pp];
    856 
    857                 if (H_BEG[cc][pp] < min_h_beg) min_h_beg = H_BEG[cc][pp];
    858                 if (H_BEG[cc][pp] > max_h_beg) max_h_beg = H_BEG[cc][pp];
    859 
    860                 if (H_END[cc][pp] < min_h_end) min_h_end = H_END[cc][pp];
    861                 if (H_END[cc][pp] > max_h_end) max_h_end = H_END[cc][pp];
    862 
    863                 if (V_BEG[cc][pp] < min_v_beg) min_v_beg = V_BEG[cc][pp];
    864                 if (V_BEG[cc][pp] > max_v_beg) max_v_beg = V_BEG[cc][pp];
    865 
    866                 if (V_END[cc][pp] < min_v_end) min_v_end = V_END[cc][pp];
    867                 if (V_END[cc][pp] > max_v_end) max_v_end = V_END[cc][pp];
    868 
    869                 if (D_BEG[cc][pp] < min_d_beg) min_d_beg = D_BEG[cc][pp];
    870                 if (D_BEG[cc][pp] > max_d_beg) max_d_beg = D_BEG[cc][pp];
    871 
    872                 if (D_END[cc][pp] < min_d_end) min_d_end = D_END[cc][pp];
    873                 if (D_END[cc][pp] > max_d_end) max_d_end = D_END[cc][pp];
    874             }
    875         }
    876 
    877         printf(" - START : min = %d / max = %d / med = %d / delta = %d\n",
    878                min_start, max_start, (min_start+max_start)/2, max_start-min_start);
    879 
    880         printf(" - H_BEG : min = %d / max = %d / med = %d / delta = %d\n",
    881                min_h_beg, max_h_beg, (min_h_beg+max_h_beg)/2, max_h_beg-min_h_beg);
    882 
    883         printf(" - H_END : min = %d / max = %d / med = %d / delta = %d\n",
    884                min_h_end, max_h_end, (min_h_end+max_h_end)/2, max_h_end-min_h_end);
    885 
    886         printf(" - V_BEG : min = %d / max = %d / med = %d / delta = %d\n",
    887                min_v_beg, max_v_beg, (min_v_beg+max_v_beg)/2, max_v_beg-min_v_beg);
    888 
    889         printf(" - V_END : min = %d / max = %d / med = %d / delta = %d\n",
    890                min_v_end, max_v_end, (min_v_end+max_v_end)/2, max_v_end-min_v_end);
    891 
    892         printf(" - D_BEG : min = %d / max = %d / med = %d / delta = %d\n",
    893                min_d_beg, max_d_beg, (min_d_beg+max_d_beg)/2, max_d_beg-min_d_beg);
    894 
    895         printf(" - D_END : min = %d / max = %d / med = %d / delta = %d\n",
    896                min_d_end, max_d_end, (min_d_end+max_d_end)/2, max_d_end-min_d_end);
    897 
    898         printf( "\n General Scenario (Kcycles for each step)\n" );
    899         printf( " - BOOT OS           = %d\n", (min_start            )/1000 );
    900         printf( " - LOAD IMAGE        = %d\n", (min_h_beg - min_start)/1000 );
    901         printf( " - H_FILTER          = %d\n", (max_h_end - min_h_beg)/1000 );
    902         printf( " - BARRIER HORI/VERT = %d\n", (min_v_beg - max_h_end)/1000 );
    903         printf( " - V_FILTER          = %d\n", (max_v_end - min_v_beg)/1000 );
    904         printf( " - BARRIER VERT/DISP = %d\n", (min_d_beg - max_v_end)/1000 );
    905         printf( " - DISPLAY           = %d\n", (max_d_end - min_d_beg)/1000 );
    906 
    907         // TODO save these results on f_instrum
     1028    unsigned int nclusters = x_size * y_size;
     1029
     1030    unsigned int cc, pp;
     1031
     1032    unsigned int min_start = 0xFFFFFFFF;
     1033    unsigned int max_start = 0;
     1034
     1035    unsigned int min_h_beg = 0xFFFFFFFF;
     1036    unsigned int max_h_beg = 0;
     1037
     1038    unsigned int min_h_end = 0xFFFFFFFF;
     1039    unsigned int max_h_end = 0;
     1040
     1041    unsigned int min_v_beg = 0xFFFFFFFF;
     1042    unsigned int max_v_beg = 0;
     1043
     1044    unsigned int min_v_end = 0xFFFFFFFF;
     1045    unsigned int max_v_end = 0;
     1046
     1047    unsigned int min_d_beg = 0xFFFFFFFF;
     1048    unsigned int max_d_beg = 0;
     1049
     1050    unsigned int min_d_end = 0xFFFFFFFF;
     1051    unsigned int max_d_end = 0;
     1052
     1053    for (cc = 0; cc < nclusters; cc++)
     1054    {
     1055        for (pp = 0; pp < ncores; pp++ )
     1056        {
     1057            if (START[cc][pp] < min_start) min_start = START[cc][pp];
     1058            if (START[cc][pp] > max_start) max_start = START[cc][pp];
     1059
     1060            if (H_BEG[cc][pp] < min_h_beg) min_h_beg = H_BEG[cc][pp];
     1061            if (H_BEG[cc][pp] > max_h_beg) max_h_beg = H_BEG[cc][pp];
     1062
     1063            if (H_END[cc][pp] < min_h_end) min_h_end = H_END[cc][pp];
     1064            if (H_END[cc][pp] > max_h_end) max_h_end = H_END[cc][pp];
     1065
     1066            if (V_BEG[cc][pp] < min_v_beg) min_v_beg = V_BEG[cc][pp];
     1067            if (V_BEG[cc][pp] > max_v_beg) max_v_beg = V_BEG[cc][pp];
     1068
     1069            if (V_END[cc][pp] < min_v_end) min_v_end = V_END[cc][pp];
     1070            if (V_END[cc][pp] > max_v_end) max_v_end = V_END[cc][pp];
     1071
     1072            if (D_BEG[cc][pp] < min_d_beg) min_d_beg = D_BEG[cc][pp];
     1073            if (D_BEG[cc][pp] > max_d_beg) max_d_beg = D_BEG[cc][pp];
     1074
     1075            if (D_END[cc][pp] < min_d_end) min_d_end = D_END[cc][pp];
     1076            if (D_END[cc][pp] > max_d_end) max_d_end = D_END[cc][pp];
     1077        }
     1078    }
     1079
     1080    // display on terminal
     1081    printf( "\n ------ %s ------\n" , filename );
     1082
     1083    printf(" - START : min = %d / max = %d / med = %d / delta = %d\n",
     1084           min_start, max_start, (min_start+max_start)/2, max_start-min_start);
     1085
     1086    printf(" - H_BEG : min = %d / max = %d / med = %d / delta = %d\n",
     1087           min_h_beg, max_h_beg, (min_h_beg+max_h_beg)/2, max_h_beg-min_h_beg);
     1088
     1089    printf(" - H_END : min = %d / max = %d / med = %d / delta = %d\n",
     1090           min_h_end, max_h_end, (min_h_end+max_h_end)/2, max_h_end-min_h_end);
     1091
     1092    printf(" - V_BEG : min = %d / max = %d / med = %d / delta = %d\n",
     1093           min_v_beg, max_v_beg, (min_v_beg+max_v_beg)/2, max_v_beg-min_v_beg);
     1094
     1095    printf(" - V_END : min = %d / max = %d / med = %d / delta = %d\n",
     1096           min_v_end, max_v_end, (min_v_end+max_v_end)/2, max_v_end-min_v_end);
     1097
     1098    printf(" - D_BEG : min = %d / max = %d / med = %d / delta = %d\n",
     1099           min_d_beg, max_d_beg, (min_d_beg+max_d_beg)/2, max_d_beg-min_d_beg);
     1100
     1101    printf(" - D_END : min = %d / max = %d / med = %d / delta = %d\n",
     1102           min_d_end, max_d_end, (min_d_end+max_d_end)/2, max_d_end-min_d_end);
     1103
     1104    printf( "\n General Scenario (Kcycles for each step)\n" );
     1105    printf( " - LOAD IMAGE        = %d\n", (min_h_beg - min_start)/1000 );
     1106    printf( " - H_FILTER          = %d\n", (max_h_end - min_h_beg)/1000 );
     1107    printf( " - BARRIER HORI/VERT = %d\n", (min_v_beg - max_h_end)/1000 );
     1108    printf( " - V_FILTER          = %d\n", (max_v_end - min_v_beg)/1000 );
     1109    printf( " - BARRIER VERT/DISP = %d\n", (min_d_beg - max_v_end)/1000 );
     1110    printf( " - DISPLAY           = %d\n", (max_d_end - min_d_beg)/1000 );
     1111    printf( " \nSEQUENCIAL = %d / PARALLEL = %d\n", SEQUENCIAL_TIME, PARALLEL_TIME );
     1112
     1113    // save on disk
     1114    fprintf( f ,  "\n ------ %s ------\n" , filename );
     1115
     1116    fprintf( f , " - START : min = %d / max = %d / med = %d / delta = %d\n",
     1117           min_start, max_start, (min_start+max_start)/2, max_start-min_start);
     1118
     1119    fprintf( f , " - H_BEG : min = %d / max = %d / med = %d / delta = %d\n",
     1120           min_h_beg, max_h_beg, (min_h_beg+max_h_beg)/2, max_h_beg-min_h_beg);
     1121
     1122    fprintf( f , " - H_END : min = %d / max = %d / med = %d / delta = %d\n",
     1123           min_h_end, max_h_end, (min_h_end+max_h_end)/2, max_h_end-min_h_end);
     1124
     1125    fprintf( f , " - V_BEG : min = %d / max = %d / med = %d / delta = %d\n",
     1126           min_v_beg, max_v_beg, (min_v_beg+max_v_beg)/2, max_v_beg-min_v_beg);
     1127
     1128    fprintf( f , " - V_END : min = %d / max = %d / med = %d / delta = %d\n",
     1129           min_v_end, max_v_end, (min_v_end+max_v_end)/2, max_v_end-min_v_end);
     1130
     1131    fprintf( f , " - D_BEG : min = %d / max = %d / med = %d / delta = %d\n",
     1132           min_d_beg, max_d_beg, (min_d_beg+max_d_beg)/2, max_d_beg-min_d_beg);
     1133
     1134    fprintf( f , " - D_END : min = %d / max = %d / med = %d / delta = %d\n",
     1135           min_d_end, max_d_end, (min_d_end+max_d_end)/2, max_d_end-min_d_end);
     1136
     1137    fprintf( f ,  "\n General Scenario (Kcycles)\n" );
     1138    fprintf( f ,  " - LOAD IMAGE        = %d\n", (min_h_beg - min_start)/1000 );
     1139    fprintf( f ,  " - H_FILTER          = %d\n", (max_h_end - min_h_beg)/1000 );
     1140    fprintf( f ,  " - BARRIER HORI/VERT = %d\n", (min_v_beg - max_h_end)/1000 );
     1141    fprintf( f ,  " - V_FILTER          = %d\n", (max_v_end - min_v_beg)/1000 );
     1142    fprintf( f ,  " - BARRIER VERT/DISP = %d\n", (min_d_beg - max_v_end)/1000 );
     1143    fprintf( f ,  " - DISPLAY           = %d\n", (max_d_end - min_d_beg)/1000 );
     1144    fprintf( f ,  " \nSEQUENCIAL = %d / PARALLEL = %d\n", SEQUENCIAL_TIME, PARALLEL_TIME );
    9081145
    9091146} // end instrument()
Note: See TracChangeset for help on using the changeset viewer.