source: trunk/user/transpose/transpose.c @ 657

Last change on this file since 657 was 657, checked in by alain, 4 years ago

Introduce remote_buf.c/.h & socket.c/.h files.
Update dev_nic.c/.h files.

File size: 37.8 KB
Line 
1//////////////////////////////////////////////////////////////////////////////////////////
2// File   : transpose.c   
3// Date   : september 2019
4// author : Alain Greiner
5//////////////////////////////////////////////////////////////////////////////////////////
6// This multi-threaded application reads a raw image (one byte per pixel)
7// stored on disk, transposes it, displays the result on the frame buffer,
8// and stores the transposed image on disk.
9//
10// The image size and the pixel encoding type are defined by the IMAGE_SIZE and
11// IMAGE_TYPE global parameters.
12//
13// It can run on a multi-cores, multi-clusters architecture, where (X_SIZE * Y_SIZE)
14// is the number of clusters and NCORES the number of cores per cluster.
15// A core is identified by two indexes [cxy,lid] : cxy is the cluster identifier,
16// (that is NOT required to be a continuous index), and lid is the local core index,
17// (that must be in the [0,NCORES-1] range).
18//
19// The main() function can run on any core in any cluster. This main thread
20// makes the initialisations, load the input file to the "image_in" buffer,
21// launches the working threads, calls the instrument() function when all working
22// threads complete, and saves the result "image_out" buffer to the output file.
23//
24// The number of working threads is always defined by the number of cores available
25// in the architecture, but this application supports three placement modes.
26// In all modes, the working threads are identified by the [tid] continuous index
27// in range [0, NTHREADS-1], and defines how the lines are shared amongst the threads.
28// This continuous index can always be decomposed in two continuous sub-indexes:
29// tid == cid * ncores + lid,  where cid is in [0,NCLUSTERS-1] and lid in [0,NCORES-1].
30//
31// - NO_PLACEMENT: the main thread is itself a working thread. The (NTHREADS-1) other
32//   working threads are created by the main thread, but the placement is done by the OS,
33//   using the DQDT for load balancing, and two working threads can be placed on the same
34//   core. The [cid,lid] are only abstract identifiers, and cannot be associated to a
35//   physical cluster or a physical core. In this mode, the main thread runs on any cluster,
36//   but has tid = 0 (i.e. cid = 0 & lid = 0).
37//
38// - EXPLICIT_PLACEMENT: the main thread is again a working thread, but the placement
39//   of the threads on the cores is explicitly controlled by the main thread to have
40//   exactly one working thread per core, and the [cxy][lpid] core coordinates for a given
41//   thread[tid] can be directly derived from the [tid] value: [cid] is an alias for the
42//   physical cluster identifier, and [lid] is the local core index.
43//
44// - PARALLEL_PLACEMENT: the main thread is not anymore a working thread, and uses the
45//   non standard pthread_parallel_create() function to avoid the costly sequential
46//   loops for pthread_create() and pthread_join(). It guarantees one working thread
47//   per core, and the same relation between the thread[tid] and the core[cxy][lpid].
48//   
49// Each working thread[cid][lid] run the "execute" function, that uses the "buf_in" and
50// "buf_out" local buffers, containing the direct and transposed images:
51// Each thread[cid][0] allocates two buf_in[cid] and buf_out[cid] buffers, load from
52// "image_in" to buf_in[cid] all lines that must be handled by the threads sharing the
53// same cid, and finally save from buf_out[cid] to "image_out" all lines that have been
54// transposed to buf_out[cid].
55// Each thread[cid][lid] in the group defined by the cid index reads pixels from the
56// local buf_in[cid] buffer, and writes pixels to all remote buf_out[cid] buffers.
57//
58// - The image  must fit the frame buffer size, that must be power of 2.
59// - The number of clusters  must be a power of 2 no larger than 256.
60// - The number of cores per cluster must be a power of 2 no larger than 4.
61// - The number of threads cannot be larger than IMAGE_SIZE.
62//
63//////////////////////////////////////////////////////////////////////////////////////////
64
65#include <sys/mman.h>
66#include <stdio.h>
67#include <stdlib.h>
68#include <unistd.h>
69#include <pthread.h>
70#include <string.h>
71#include <almosmkh.h>
72#include <fcntl.h>
73#include <hal_macros.h>
74
// Upper bounds on the platform size : they define the size of the global arrays.
#define X_MAX                 16                           // max number of clusters in row
#define Y_MAX                 16                           // max number of clusters in column
#define CORES_MAX             4                            // max number of cores per cluster
#define CLUSTERS_MAX          (X_MAX * Y_MAX)              // max number of clusters
#define THREADS_MAX           (X_MAX * Y_MAX * CORES_MAX)  // max number of threads

#define IMAGE_TYPE            420                          // pixel encoding type

// Image selection : exactly one (IMAGE_SIZE / INPUT_FILE_PATH / OUTPUT_FILE_PATH)
// triple must be left uncommented.

//#define IMAGE_SIZE            128                          // image size
//#define INPUT_FILE_PATH       "/misc/images_128.raw"       // input file pathname
//#define OUTPUT_FILE_PATH      "/misc/transposed_128.raw"   // output file pathname

//#define IMAGE_SIZE            256                          // image size
//#define INPUT_FILE_PATH       "/misc/lena_256.raw"         // input file pathname
//#define OUTPUT_FILE_PATH      "/misc/transposed_256.raw"   // output file pathname

//#define IMAGE_SIZE            512                          // image size
//#define INPUT_FILE_PATH       "/misc/couple_512.raw"       // input file pathname
//#define OUTPUT_FILE_PATH      "/misc/transposed_512.raw"   // output file pathname

#define IMAGE_SIZE            1024                         // image size
#define INPUT_FILE_PATH       "/misc/philips_1024.raw"     // input file pathname
#define OUTPUT_FILE_PATH      "/misc/transposed_1024.raw"  // output file pathname

#define SAVE_RESULT_FILE      0                            // save result image on disk
#define USE_DQT_BARRIER       0                            // quad-tree barrier if non zero

// Placement mode : exactly one of the three following flags must be set to 1
// (this is checked at run time by the main() function).
#define NO_PLACEMENT          0                            // uncontrolled threads placement
#define EXPLICIT_PLACEMENT    1                            // explicit threads placement
#define PARALLEL_PLACEMENT    0                            // parallel threads placement

#define VERBOSE_MAIN          1                            // main function print comments
#define VERBOSE_MAIN_DETAILED 0                            // main function print per-thread comments
#define VERBOSE_EXEC          1                            // exec function print comments
109
110
111///////////////////////////////////////////////////////
112//                global variables
113///////////////////////////////////////////////////////
114
115// global instrumentation counters for the main thread
116unsigned int SEQUENCIAL_TIME = 0;
117unsigned int PARALLEL_TIME   = 0;
118
119// instrumentation counters for each thread in each cluster
120// indexed by [cid][lid] : cluster continuous index / thread local index
121unsigned int ALOC_START[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
122unsigned int ALOC_END  [CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
123unsigned int LOAD_START[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
124unsigned int LOAD_END  [CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
125unsigned int TRSP_START[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
126unsigned int TRSP_END  [CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
127unsigned int SAVE_START[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
128unsigned int SAVE_END  [CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
129unsigned int FREE_START[CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
130unsigned int FREE_END  [CLUSTERS_MAX][CORES_MAX] = {{ 0 }};
131
132// buffer containing the input image, loaded by the main from input file
133unsigned char  image_in[IMAGE_SIZE * IMAGE_SIZE];
134
135// buffer containing the output image, saved by the main to output file
136unsigned char  image_out[IMAGE_SIZE * IMAGE_SIZE];
137
138// arrays of pointers on distributed buffers indexed by [cid]
139unsigned char *  buf_in [CLUSTERS_MAX];
140unsigned char *  buf_out[CLUSTERS_MAX];
141
142// pointer and identifier for dynamically allocated FBF window
143void   *  win_buf;
144int       wid;
145
146// synchronisation barrier (all working threads)
147pthread_barrier_t   barrier;
148
149// platform parameters
150unsigned int  x_size;              // number of clusters in a row
151unsigned int  y_size;              // number of clusters in a column
152unsigned int  ncores;              // number of cores per cluster
153
154// main thread continuous index
155unsigned int     tid_main; 
156
157//return values at thread exit
158unsigned int THREAD_EXIT_SUCCESS = 0;
159unsigned int THREAD_EXIT_FAILURE = 1;
160
161// array of kernel thread identifiers / indexed by [tid]
162pthread_t                     exec_trdid[THREADS_MAX];   
163
164// array of execute function arguments / indexed by [tid]
165pthread_parallel_work_args_t  exec_args[THREADS_MAX];
166
167// array of thread attributes / indexed by [tid]
168pthread_attr_t                exec_attr[THREADS_MAX];
169
170////////////////////////////////////////////////////////////////
171//             functions declaration
172////////////////////////////////////////////////////////////////
173
174void * execute( void * arguments );
175
176void instrument( FILE * f , char * filename );
177
178////////////////
179int main( void )
180{
181    unsigned long long start_cycle;
182    unsigned long long end_sequencial_cycle;
183    unsigned long long end_parallel_cycle;
184
185    char               filename[32];      // instrumentation file name
186    char               pathname[64];      // instrumentation file pathname
187
188    int error;
189
190    /////////////////////////////////////////////////////////////////////////////////
191    get_cycle( &start_cycle );
192    /////////////////////////////////////////////////////////////////////////////////
193
194    if( (NO_PLACEMENT + EXPLICIT_PLACEMENT + PARALLEL_PLACEMENT) != 1 )
195    {
196        printf("\n[transpose error] illegal placement\n");
197        exit( 0 );
198    }
199
200    // get & check plat-form parameters
201    get_config( &x_size,
202                &y_size,
203                &ncores );
204
205    if((ncores != 1) && (ncores != 2) && (ncores != 4))
206    {
207        printf("\n[transpose error] number of cores per cluster must be 1/2/4\n");
208        exit( 0 );
209    }
210
211    if( (x_size != 1) && (x_size != 2) && (x_size != 4) && 
212        (x_size != 8) && (x_size != 16) )
213    {
214        printf("\n[transpose error] x_size must be 1/2/4/8/16\n");
215        exit( 0 );
216    }
217       
218    if( (y_size != 1) && (y_size != 2) && (y_size != 4) && 
219        (y_size != 8) && (y_size != 16) )
220    {
221        printf("\n[transpose error] y_size must be 1/2/4/8/16\n");
222        exit( 0 );
223    }
224       
225    // get identifiers for core executing main
226    unsigned int  cxy_main;
227    unsigned int  lid_main;
228    get_core_id( &cxy_main , &lid_main );
229
230    // compute number of threads
231    unsigned int nclusters = x_size * y_size;
232    unsigned int nthreads  = nclusters * ncores;
233
234    if( nthreads > IMAGE_SIZE )
235    {
236        printf("\n[transpose error] number of threads larger than number of lines\n");
237        exit( 0 );
238    }
239
240    // get FBF size and type
241    unsigned int   fbf_width;
242    unsigned int   fbf_height;
243    unsigned int   fbf_type;
244    fbf_get_config( &fbf_width , &fbf_height , &fbf_type );
245
246    if( (fbf_width < IMAGE_SIZE) || (fbf_height < IMAGE_SIZE) || (fbf_type != IMAGE_TYPE) )
247    {
248        printf("\n[transpose error] image does not fit FBF size or type\n");
249        exit( 0 );
250    }
251
252    // define total number of pixels
253    int npixels = IMAGE_SIZE * IMAGE_SIZE;
254
255    // define instrumentation file name
256    if( NO_PLACEMENT )
257    {
258        printf("\n[transpose] %d cluster(s) / %d core(s) / <%s> / PID %x / NO_PLACE\n",
259        nclusters, ncores, INPUT_FILE_PATH , getpid() );
260
261        // build instrumentation file name
262        if( USE_DQT_BARRIER )
263        snprintf( filename , 32 , "trsp_dqt_no_place_%d_%d_%d",
264        IMAGE_SIZE , x_size * y_size , ncores );
265        else
266        snprintf( filename , 32 , "trsp_smp_no_place_%d_%d_%d",
267        IMAGE_SIZE , x_size * y_size , ncores );
268    }
269
270    if( EXPLICIT_PLACEMENT )
271    {
272        printf("\n[transpose] %d cluster(s) / %d core(s) / <%s> / PID %x / EXPLICIT\n",
273        nclusters, ncores, INPUT_FILE_PATH , getpid() );
274
275        // build instrumentation file name
276        if( USE_DQT_BARRIER )
277        snprintf( filename , 32 , "trsp_dqt_explicit_%d_%d_%d",
278        IMAGE_SIZE , x_size * y_size , ncores );
279        else
280        snprintf( filename , 32 , "trsp_smp_explicit_%d_%d_%d",
281        IMAGE_SIZE , x_size * y_size , ncores );
282    }
283
284    if( PARALLEL_PLACEMENT )
285    {
286        printf("\n[transpose] %d cluster(s) / %d core(s) / <%s> / PID %x / PARALLEL\n",
287        nclusters, ncores, INPUT_FILE_PATH , getpid() );
288
289        // build instrumentation file name
290        if( USE_DQT_BARRIER )
291        snprintf( filename , 32 , "trsp_dqt_parallel_%d_%d_%d",
292        IMAGE_SIZE , x_size * y_size , ncores );
293        else
294        snprintf( filename , 32 , "trsp_smp_parallel_%d_%d_%d",
295        IMAGE_SIZE , x_size * y_size , ncores );
296    }
297
298    // open a window in FBF
299    wid = fbf_create_window( 0,             // l_zero
300                             0,             // p_zero
301                             IMAGE_SIZE,    // lines
302                             IMAGE_SIZE,    // pixels
303                             &win_buf );
304    if( wid < 0) 
305    {
306        printf("\n[transpose error] cannot open FBF window\n");
307        exit( 0 );
308    }
309
310#if  VERBOSE_MAIN
311printf("\n[transpose] main on core[%x,%d] created FBF window %d / buffer %x\n",
312cxy_main, lid_main, wid , win_buf );
313#endif
314
315    // open instrumentation file
316    snprintf( pathname , 64 , "/home/%s", filename );
317    FILE * f = fopen( pathname , NULL );
318
319    if ( f == NULL ) 
320    { 
321        printf("\n[transpose error] cannot open instru file %s\n", pathname );
322        exit( 0 );
323    }
324
325#if  VERBOSE_MAIN
326printf("\n[transpose] main on core[%x,%d] open instrumentation file %s\n",
327cxy_main, lid_main, pathname );
328#endif
329
330    // main thread initializes barrier
331    if( USE_DQT_BARRIER )
332    {
333        pthread_barrierattr_t attr;
334        attr.x_size   = x_size;
335        attr.y_size   = y_size;
336        attr.nthreads = ncores;
337        error = pthread_barrier_init( &barrier, &attr , nthreads );
338    }
339    else
340    {
341        error = pthread_barrier_init( &barrier, NULL , nthreads );
342    }
343
344    if( error )
345    { 
346        printf("\n[transpose error] main cannot initialize barrier\n" );
347        exit( 0 );
348    }
349
350#if  VERBOSE_MAIN
351printf("\n[transpose] main on core[%x,%d] completed barrier initialisation\n",
352cxy_main, lid_main );
353#endif
354
355    // open input file
356    int fd_in = open( INPUT_FILE_PATH , O_RDONLY , 0 ); 
357
358    if ( fd_in < 0 ) 
359    { 
360        printf("\n[transpose error] main cannot open file %s\n", INPUT_FILE_PATH );
361        exit( 0 );
362    }
363
364#if  VERBOSE_MAIN
365printf("\n[transpose] main open file <%s> / fd = %d\n", INPUT_FILE_PATH , fd_in );
366#endif
367
368    // open output file
369    int fd_out = open( OUTPUT_FILE_PATH , O_CREAT , 0 ); 
370
371    if ( fd_out < 0 ) 
372    { 
373        printf("\n[transpose error] main cannot open file %s\n", OUTPUT_FILE_PATH );
374        exit( 0 );
375    }
376
377    // move input image to input buffer
378    if( read( fd_in , image_in , npixels ) != npixels )
379    {
380        printf("\n[transpose error] main cannot read input image\n");
381        exit( 0 );
382    }
383
384#if  VERBOSE_MAIN
385printf("\n[transpose] main moved file <%s> to buf_in\n", INPUT_FILE_PATH );
386#endif
387
388    /////////////////////////////////////////////////////////////////////////////////////
389    get_cycle( &end_sequencial_cycle );
390    SEQUENCIAL_TIME = (unsigned int)(end_sequencial_cycle - start_cycle);
391    /////////////////////////////////////////////////////////////////////////////////////
392
393    //////////////////
394    if( NO_PLACEMENT )
395    {
396        // the tid value for the main thread is always 0
397        // main thread creates new threads with tid in [1,nthreads-1] 
398        unsigned int tid;
399        for ( tid = 0 ; tid < nthreads ; tid++ )
400        {
401            // register tid value in exec_args[tid] array
402            exec_args[tid].tid = tid;
403           
404            // create other threads
405            if( tid > 0 )
406            {
407                if ( pthread_create( &exec_trdid[tid], 
408                                     NULL,                  // no attribute
409                                     &execute,
410                                     &exec_args[tid] ) ) 
411                {
412                    printf("\n[transpose error] cannot create thread %d\n", tid );
413                    exit( 0 );
414                }
415
416#if VERBOSE_MAIN_DETAILED
417printf("\n[transpose] main created thread %d\n", tid );
418#endif
419
420            }
421            else
422            {
423                tid_main = 0;
424            }
425        }  // end for tid
426
427        // main thread calls itself the execute() function
428        execute( &exec_args[0] );
429
430        // main thread wait other threads completion
431        for ( tid = 1 ; tid < nthreads ; tid++ )
432        {
433            unsigned int * status;
434
435            // main wait thread[tid] status
436            if ( pthread_join( exec_trdid[tid], (void*)(&status)) )
437            {
438                printf("\n[transpose error] main cannot join thread %d\n", tid );
439                exit( 0 );
440            }
441       
442            // check status
443            if( *status != THREAD_EXIT_SUCCESS )
444            {
445                printf("\n[transpose error] thread %x returned failure\n", tid );
446                exit( 0 );
447            }
448
449#if VERBOSE_MAIN_DETAILED
450printf("\n[transpose] main joined thread %x\n", tid );
451#endif
452       
453        }  // end for tid
454
455    }  // end if no_placement
456
457    ////////////////////////
458    if( EXPLICIT_PLACEMENT )
459    {
460        // main thread places each other threads on a specific core[cxy][lid]
461        // but the actual thread creation is sequencial
462        unsigned int x;
463        unsigned int y;
464        unsigned int l;
465        unsigned int cxy;                   // cluster identifier
466        unsigned int tid;                   // thread continuous index
467
468        for( x = 0 ; x < x_size ; x++ )
469        {
470            for( y = 0 ; y < y_size ; y++ )
471            {
472                cxy = HAL_CXY_FROM_XY( x , y );
473                for( l = 0 ; l < ncores ; l++ )
474                {
475                    // compute thread continuous index
476                    tid = (((* y_size) + y) * ncores) + l;
477
478                    // register tid value in exec_args[tid] array
479                    exec_args[tid].tid = tid;
480
481                    // no thread created on the core running the main
482                    if( (cxy != cxy_main) || (l != lid_main) )
483                    {
484                        // define thread attributes
485                        exec_attr[tid].attributes = PT_ATTR_CLUSTER_DEFINED |
486                                                    PT_ATTR_CORE_DEFINED;
487                        exec_attr[tid].cxy        = cxy;
488                        exec_attr[tid].lid        = l;
489 
490                        // create thread[tid] on core[cxy][l]
491                        if ( pthread_create( &exec_trdid[tid],   
492                                             &exec_attr[tid],   
493                                             &execute,
494                                             &exec_args[tid] ) )       
495                        {
496                            printf("\n[transpose error] cannot create thread %d\n", tid );
497                            exit( 0 );
498                        }
499
500#if VERBOSE_MAIN_DETAILED
501printf("\n[transpose] main created thread[%d] on core[%x,%d]\n", tid, cxy, l );
502#endif
503                    }
504                    else
505                    {
506                        tid_main = tid;
507                    }
508                }
509            }
510        }
511
512        // main thread calls itself the execute() function
513        execute( &exec_args[tid_main] );
514
515        // main thread wait other threads completion
516        for( tid = 0 ; tid < nthreads ; tid++ )
517        {
518            // no other thread on the core running the main
519            if( tid != tid_main )
520            {
521                unsigned int * status;
522
523                // wait thread[tid]
524                if( pthread_join( exec_trdid[tid] , (void*)(&status) ) )
525                {
526                    printf("\n[transpose error] main cannot join thread %d\n", tid );
527                    exit( 0 );
528                }
529       
530                // check status
531                if( *status != THREAD_EXIT_SUCCESS )
532                {
533                    printf("\n[transpose error] thread %d returned failure\n", tid );
534                    exit( 0 );
535                }
536
537#if VERBOSE_MAIN_DETAILED
538printf("\n[transpose] main joined thread %d\n", tid );
539#endif
540            }
541        }
542    }  // end if explicit_placement
543
544    ////////////////////////
545    if( PARALLEL_PLACEMENT )
546    {
547        // compute covering DQT size an level
548        unsigned int z          = (x_size > y_size) ? x_size : y_size;
549        unsigned int root_level = ((z == 1) ? 0 : 
550                                  ((z == 2) ? 1 : 
551                                  ((z == 4) ? 2 : 
552                                  ((z == 8) ? 3 : 4))));
553
554        // create & execute the working threads
555        if( pthread_parallel_create( root_level , &execute ) )
556        {
557            printf("\n[transpose error] in %s\n", __FUNCTION__ );
558            exit( 0 );
559        }
560    }  // end if parallel_placement
561
562
563    /////////////////////////////////////////////////////////////////////////////
564    get_cycle( &end_parallel_cycle );
565    PARALLEL_TIME = (unsigned int)(end_parallel_cycle - end_sequencial_cycle);
566    /////////////////////////////////////////////////////////////////////////////
567
568    // register instrumentation results
569    instrument( f , filename );
570
571#if VERBOSE_MAIN
572printf("\n[transpose] main completed instrumentation\n");
573#endif
574
575/*
576    printf("\n> ");
577    getchar();
578
579    // move window
580    if( fbf_move_window( wid , 100 , 100 ) )
581    {
582        printf("\n[transpose error] main cannot move FBF window\n");
583        exit( 0 );
584    }
585
586    printf("\n> ");
587    getchar();
588*/   
589    // save image_out to output file
590    if( write( fd_out , image_out , npixels ) != npixels )
591    {
592        printf("\n[transpose error] main cannot write output image\n");
593        exit( 0 );
594    }
595
596#if VERBOSE_MAIN
597printf("\n[transpose] main saved buf_out to output file\n");
598#endif
599
600    // close input file
601    close( fd_in );
602
603#if VERBOSE_MAIN
604printf("\n[transpose] main closed input file\n");
605#endif
606
607    // close output file
608    close( fd_out );
609
610#if VERBOSE_MAIN
611printf("\n[transpose] main closed output file\n");
612#endif
613
614    // close instrumentation file
615    fclose( f );
616
617#if VERBOSE_MAIN
618printf("\n[transpose] main closed instrumentation file\n");
619#endif
620
621    // delete FBF window
622    if( fbf_delete_window( wid ) )
623    {
624        printf("\n[transpose error] main cannot delete FBF window\n");
625        exit( 0 );
626    }
627
628    // main thread suicide
629    exit( 0 );
630   
631    return 0;
632
633} // end main()
634
635
636
637
638//////////////////////////////////
639void * execute( void * arguments ) 
640{
641    unsigned long long   date;
642    unsigned int         l;         // line index for loop
643    unsigned int         p;         // pixel index for loop
644    int                  error;
645
646    unsigned char      * wbuf = win_buf;
647 
648    pthread_parallel_work_args_t * args = (pthread_parallel_work_args_t *)arguments;
649
650    // WARNING
651    //A thread is identified by the tid index, defined in the "args" structure.
652    // This index being in range [0,nclusters*ncores-1] we can always write
653    //       tid == cid * ncores + lid
654    // with cid in [0,nclusters-1] and lid in [0,ncores-1].
655    // if NO_PLACEMENT, there is no relation between these
656    // thread [cid][lid] indexes, and the core coordinates [cxy][lpid]
657
658    // get thread abstract identifiers
659    unsigned int tid = args->tid;
660    unsigned int cid = tid / ncores;    // abstract cluster index
661    unsigned int lid = tid % ncores;    // local thread index
662
663#if VERBOSE_EXEC
664unsigned int cxy;
665unsigned int lpid;
666get_core_id( &cxy , &lpid );   // get core physical identifiers
667#endif
668
669#if VERBOSE_EXEC
670printf("\n[transpose] exec[%d] on core[%x,%d] enters parallel exec\n",
671tid , cxy , lpid );
672#endif
673
674    get_cycle( &date );
675    ALOC_START[cid][lid] = (unsigned int)date;
676
677    // compute total number of pixels per image
678    unsigned int npixels = IMAGE_SIZE * IMAGE_SIZE;     
679
680    // compute total number of threads and clusters
681    unsigned int nclusters = x_size * y_size;
682    unsigned int nthreads  = nclusters * ncores;
683
684    // compute number of pixels per cid & per thread
685    unsigned int pixels_per_cid = npixels / nclusters;
686    unsigned int pixels_per_lid = pixels_per_cid / ncores;
687
688    // compute first and last line per thread
689    unsigned int lines_per_cid = pixels_per_cid / IMAGE_SIZE;
690    unsigned int lines_per_lid = pixels_per_lid / IMAGE_SIZE;
691
692    unsigned int line_first = (cid * lines_per_cid) + (lid * lines_per_lid);
693    unsigned int line_last  = line_first + lines_per_lid;
694
695    // Each thread[cid,0] allocates two local buffers, and register the base
696    // adresses in the global variable buf_in_ptr[cid] & buf_out_ptr[cid].
697   
698    if( lid == 0 )
699    {
700        // allocate buf_in
701        buf_in[cid] = (unsigned char *)malloc( pixels_per_cid );
702
703        if( buf_in[cid] == NULL )
704        {
705            printf("\n[transpose error] thread[%d] cannot allocate buf_in\n", tid );
706            pthread_exit( &THREAD_EXIT_FAILURE );
707        }
708
709#if VERBOSE_EXEC
710printf("\n[transpose] exec[%d] on core[%x,%d] allocated buf_in = %x\n",
711tid , cxy , lpid , buf_in );
712#endif
713
714        // allocate buf_out
715        buf_out[cid] = (unsigned char *)malloc( pixels_per_cid );
716
717        if( buf_out[cid] == NULL )
718        {
719            printf("\n[transpose error] thread[%d] cannot allocate buf_in\n", tid );
720            pthread_exit( &THREAD_EXIT_FAILURE );
721        }
722
723#if VERBOSE_EXEC
724printf("\n[transpose] exec[%d] on core[%x,%d] allocated buf_out = %x\n",
725tid , cxy , lpid , buf_out );
726#endif
727
728    }
729
730    get_cycle( &date );
731    ALOC_END[cid][lid] = (unsigned int)date;
732
733    /////////////////////////////////
734    pthread_barrier_wait( &barrier );
735    /////////////////////////////////
736
737    get_cycle( &date );
738    LOAD_START[cid][lid] = (unsigned int)date;
739
740    // all threads copy relevant part of the image_in to buf_in[cid]
741    memcpy( buf_in[cid] + (lid * pixels_per_lid), 
742            image_in + (cid * pixels_per_cid) + (lid * pixels_per_lid),
743            pixels_per_lid );
744
745#if VERBOSE_EXEC
746printf("\n[transpose] exec[%d] on core[%x,%d] loaded buf_in[%d]\n",
747tid , cxy , lpid , cid );
748#endif
749
750    // all local threads copy part of buf_in[cid] to FBF window for display
751    memcpy( wbuf + (cid * pixels_per_cid) + (lid * pixels_per_lid),
752            buf_in[cid] + (lid * pixels_per_lid),
753            pixels_per_lid );
754
755#if  VERBOSE_EXEC
756printf("\n[transpose] exec[%d] on core[%x,%d] loaded buf_in to FBF (first %d / last %d)\n",
757tid , cxy , lpid , line_first , line_last );
758#endif
759
760    // retresh window
761    error = fbf_refresh_window( wid , line_first , line_last );
762
763    if( error )
764    {
765        printf("\n[transpose error] exec[%d] cannot refresh FBF window\n", tid );
766        exit( 0 );
767    }
768
769    get_cycle( &date );
770    LOAD_END[cid][lid] = (unsigned int)date;
771
772    /////////////////////////////////
773    pthread_barrier_wait( &barrier );
774    /////////////////////////////////
775
776    get_cycle( &date );
777    TRSP_START[cid][lid] = (unsigned int)date;
778
779    // All threads contribute to parallel transpose from buf_in to buf_out:
780    // each thread makes the transposition for nlt lines (nlt = npixels/nthreads)
781    // from line [tid*nlt] to line [(tid + 1)*nlt - 1]
782    // (p,l) are the absolute pixel coordinates in the source image
783    // (l,p) are the absolute pixel coordinates in the dest image
784
785    unsigned int nlt   = IMAGE_SIZE / nthreads;    // number of lines per thread
786    unsigned int nlc   = IMAGE_SIZE / nclusters;   // number of lines per cluster
787
788    unsigned int src_cid;
789    unsigned int src_index;
790    unsigned int dst_cid;
791    unsigned int dst_index;
792
793    unsigned char byte;
794
795    unsigned int first = tid * nlt;        // first line index for a given thread
796    unsigned int last  = first + nlt;      // last line index for a given thread
797
798    // loop on lines handled by this thread
799    for ( l = first ; l < last ; l++ )
800    {
801        // loop on pixels in one line (one pixel per iteration)
802        for ( p = 0 ; p < IMAGE_SIZE ; p++ )
803        {
804            // read one byte from local buf_in
805            src_cid   = l / nlc;
806            src_index = (l % nlc) * IMAGE_SIZE + p;
807
808            byte = buf_in[src_cid][src_index];
809
810            // write one byte to remote buf_out
811            dst_cid   = p / nlc; 
812            dst_index = (p % nlc) * IMAGE_SIZE + l;
813
814            buf_out[dst_cid][dst_index] = byte;
815        }
816    }
817
818#if VERBOSE_EXEC
819printf("\n[transpose] exec[%d] on core[%x,%d] completes transpose\n",
820tid , cxy , lpid );
821#endif
822
823    get_cycle( &date );
824    TRSP_END[cid][lid] = (unsigned int)date;
825
826    /////////////////////////////////
827    pthread_barrier_wait( &barrier );
828    /////////////////////////////////
829
830    get_cycle( &date );
831    SAVE_START[cid][lid] = (unsigned int)date;
832
833    // each local threads copy part of buf_out[cid] to FBF window for display
834    memcpy( wbuf + (cid * pixels_per_cid) + (lid * pixels_per_lid),
835            buf_out[cid] + (lid * pixels_per_lid),
836            pixels_per_lid );
837
838#if  VERBOSE_EXEC
839printf("\n[transpose] exec[%d] on core[%x,%d] loaded buf_out to FBF (first %d / last %d)\n",
840tid , cxy , lpid , line_first , line_last );
841#endif
842
843    // refresh window
844    error = fbf_refresh_window( wid , line_first , line_last );
845
846    if( error )
847    {
848        printf("\n[transpose error] exec[%d] cannot refresh FBF window\n", tid );
849        exit( 0 );
850    }
851
852    // each local thread copy relevant part of buf_out to image_out
853    memcpy( image_out + (cid * pixels_per_cid) + (lid * pixels_per_lid),
854            buf_out[cid] + (lid * pixels_per_lid),
855            pixels_per_lid );
856
857#if VERBOSE_EXEC
858printf("\n[transpose] exec[%d] on core[%x,%d] saved buf_out[%d]\n",
859tid , cxy , lpid , cid );
860#endif
861
862    get_cycle( &date );
863    SAVE_END[cid][lid] = (unsigned int)date;
864
865    /////////////////////////////////
866    pthread_barrier_wait( &barrier );
867    /////////////////////////////////
868
869    get_cycle( &date );
870    FREE_START[cid][lid] = (unsigned int)date;
871
872    // Each thread[cid,0] release local buffers buf_in & buf_out
873
874    if( lid == 0 )
875    {
876        // release local buffers
877        free( buf_in[cid] );
878        free( buf_out[cid] );
879
880#if VERBOSE_EXEC
881printf("\n[transpose] exec[%d] on core[%x,%d] released buf_in & buf_out\n",
882tid , cxy , lpid );
883#endif
884
885    }
886
887    get_cycle( &date );
888    FREE_END[cid][lid] = (unsigned int)date;
889
890    /////////////////////////////////
891    pthread_barrier_wait( &barrier );
892    /////////////////////////////////
893   
894    // thread termination depends on the placement policy
895    if( PARALLEL_PLACEMENT )   
896    {
897        // <work> threads are running in detached mode, and
898        // each thread must signal completion by calling barrier
899        // passed in arguments before exit
900
901        pthread_barrier_wait( args->barrier );
902
903        pthread_exit( &THREAD_EXIT_SUCCESS );
904    }
905    else
906    {
907        // <work> threads are running in attached mode
908        // each thread, except the main one, simply exits
909        if ( tid != tid_main ) 
910        {
911
912#if VERBOSE_EXEC
913printf("\n[transpose] exec[%d] on core[%x,%d] exit\n",
914tid , cxy , lpid );
915#endif
916            pthread_exit( &THREAD_EXIT_SUCCESS );
917        }
918    }
919
920    return NULL;
921
922} // end execute()
923
924
925
926//////////////////////////
927void instrument( FILE * f,
928                 char * filename )
929{
930    unsigned int cid;
931    unsigned int l;
932
933    unsigned int min_aloc_start = 0xFFFFFFFF;
934    unsigned int max_aloc_start = 0;
935    unsigned int min_aloc_ended = 0xFFFFFFFF;
936    unsigned int max_aloc_ended = 0;
937    unsigned int min_load_start = 0xFFFFFFFF;
938    unsigned int max_load_start = 0;
939    unsigned int min_load_ended = 0xFFFFFFFF;
940    unsigned int max_load_ended = 0;
941    unsigned int min_trsp_start = 0xFFFFFFFF;
942    unsigned int max_trsp_start = 0;
943    unsigned int min_trsp_ended = 0xFFFFFFFF;
944    unsigned int max_trsp_ended = 0;
945    unsigned int min_save_start = 0xFFFFFFFF;
946    unsigned int max_save_start = 0;
947    unsigned int min_save_ended = 0xFFFFFFFF;
948    unsigned int max_save_ended = 0;
949    unsigned int min_free_start = 0xFFFFFFFF;
950    unsigned int max_free_start = 0;
951    unsigned int min_free_ended = 0xFFFFFFFF;
952    unsigned int max_free_ended = 0;
953 
954    for (cid = 0; cid < (x_size * y_size) ; cid++)
955    {
956        for ( l = 0 ; l < ncores ; l++ )
957        {
958            if (ALOC_START[cid][l] < min_aloc_start)  min_aloc_start = ALOC_START[cid][l];
959            if (ALOC_START[cid][l] > max_aloc_start)  max_aloc_start = ALOC_START[cid][l];
960            if (ALOC_END[cid][l]   < min_aloc_ended)  min_aloc_ended = ALOC_END[cid][l]; 
961            if (ALOC_END[cid][l]   > max_aloc_ended)  max_aloc_ended = ALOC_END[cid][l];
962            if (LOAD_START[cid][l] < min_load_start)  min_load_start = LOAD_START[cid][l];
963            if (LOAD_START[cid][l] > max_load_start)  max_load_start = LOAD_START[cid][l];
964            if (LOAD_END[cid][l]   < min_load_ended)  min_load_ended = LOAD_END[cid][l]; 
965            if (LOAD_END[cid][l]   > max_load_ended)  max_load_ended = LOAD_END[cid][l];
966            if (TRSP_START[cid][l] < min_trsp_start)  min_trsp_start = TRSP_START[cid][l];
967            if (TRSP_START[cid][l] > max_trsp_start)  max_trsp_start = TRSP_START[cid][l];
968            if (TRSP_END[cid][l]   < min_trsp_ended)  min_trsp_ended = TRSP_END[cid][l];
969            if (TRSP_END[cid][l]   > max_trsp_ended)  max_trsp_ended = TRSP_END[cid][l];
970            if (SAVE_START[cid][l] < min_save_start)  min_save_start = SAVE_START[cid][l];
971            if (SAVE_START[cid][l] > max_save_start)  max_save_start = SAVE_START[cid][l];
972            if (SAVE_END[cid][l]   < min_save_ended)  min_save_ended = SAVE_END[cid][l];
973            if (SAVE_END[cid][l]   > max_save_ended)  max_save_ended = SAVE_END[cid][l];
974            if (FREE_START[cid][l] < min_free_start)  min_free_start = FREE_START[cid][l];
975            if (FREE_START[cid][l] > max_free_start)  max_free_start = FREE_START[cid][l];
976            if (FREE_END[cid][l]   < min_free_ended)  min_free_ended = FREE_END[cid][l];
977            if (FREE_END[cid][l]   > max_free_ended)  max_free_ended = FREE_END[cid][l];
978        }
979    }
980
981    printf( "\n ------ %s ------\n" , filename );
982    fprintf( f , "\n ------ %s ------\n" , filename );
983
984    printf( " - ALOC_START : min = %d / max = %d / delta = %d\n",
985           min_aloc_start, max_aloc_start, max_aloc_start-min_aloc_start ); 
986    fprintf( f , " - ALOC_START : min = %d / max = %d / delta = %d\n",
987           min_aloc_start, max_aloc_start, max_aloc_start-min_aloc_start ); 
988
989    printf( " - ALOC_END   : min = %d / max = %d / delta = %d\n",
990           min_aloc_start, max_aloc_start, max_aloc_start-min_aloc_start ); 
991    fprintf( f , " - ALOC_END   : min = %d / max = %d / delta = %d\n",
992           min_aloc_start, max_aloc_start, max_aloc_start-min_aloc_start ); 
993
994    printf( " - LOAD_START : min = %d / max = %d / delta = %d\n",
995           min_load_start, max_load_start, max_load_start-min_load_start ); 
996    fprintf( f , " - LOAD_START : min = %d / max = %d / delta = %d\n",
997           min_load_start, max_load_start, max_load_start-min_load_start ); 
998
999    printf( " - LOAD_END   : min = %d / max = %d / delta = %d\n",
1000           min_load_ended, max_load_ended, max_load_ended-min_load_ended ); 
1001    fprintf( f , " - LOAD_END   : min = %d / max = %d / delta = %d\n",
1002           min_load_ended, max_load_ended, max_load_ended-min_load_ended ); 
1003
1004    printf( " - TRSP_START : min = %d / max = %d / delta = %d\n",
1005           min_trsp_start, max_trsp_start, max_trsp_start-min_trsp_start ); 
1006    fprintf( f , " - TRSP_START : min = %d / max = %d / delta = %d\n",
1007           min_trsp_start, max_trsp_start, max_trsp_start-min_trsp_start ); 
1008
1009    printf( " - TRSP_END   : min = %d / max = %d / delta = %d\n",
1010           min_trsp_ended, max_trsp_ended, max_trsp_ended-min_trsp_ended ); 
1011    fprintf( f , " - TRSP_END   : min = %d / max = %d / delta = %d\n",
1012           min_trsp_ended, max_trsp_ended, max_trsp_ended-min_trsp_ended ); 
1013
1014    printf( " - SAVE_START : min = %d / max = %d / delta = %d\n",
1015           min_save_start, max_save_start, max_save_start-min_save_start ); 
1016    fprintf( f , " - SAVE_START : min = %d / max = %d / delta = %d\n",
1017           min_save_start, max_save_start, max_save_start-min_save_start ); 
1018
1019    printf( " - SAVE_END   : min = %d / max = %d / delta = %d\n",
1020           min_save_ended, max_save_ended, max_save_ended-min_save_ended ); 
1021    fprintf( f , " - SAVE_END   : min = %d / max = %d / delta = %d\n",
1022           min_save_ended, max_save_ended, max_save_ended-min_save_ended ); 
1023
1024    printf( " - FREE_START : min = %d / max = %d / delta = %d\n",
1025           min_free_start, max_free_start, max_free_start-min_free_start ); 
1026    fprintf( f , " - FREE_START : min = %d / max = %d / delta = %d\n",
1027           min_free_start, max_free_start, max_free_start-min_free_start ); 
1028
1029    printf( " - FREE_END   : min = %d / max = %d / delta = %d\n",
1030           min_free_start, max_free_start, max_free_start-min_free_start ); 
1031    fprintf( f , " - FREE_END   : min = %d / max = %d / delta = %d\n",
1032           min_free_start, max_free_start, max_free_start-min_free_start ); 
1033
1034
1035    printf( "\n   Sequencial %d"
1036            "\n   Parallel   %d"
1037            "\n   Alloc      %d"
1038            "\n   Load       %d"
1039            "\n   Transpose  %d"
1040            "\n   Save       %d"
1041            "\n   Free       %d\n" ,
1042            SEQUENCIAL_TIME / 1000 ,
1043            PARALLEL_TIME / 1000 ,
1044            (max_aloc_ended - min_aloc_start) / 1000 ,
1045            (max_load_ended - min_load_start) / 1000 ,
1046            (max_trsp_ended - min_trsp_start) / 1000 ,
1047            (max_save_ended - min_save_start) / 1000 ,
1048            (max_free_ended - min_free_start) / 1000 );
1049
1050    fprintf( f , "\n   Sequencial %d"
1051            "\n   Parallel   %d"
1052            "\n   Alloc      %d"
1053            "\n   Load       %d"
1054            "\n   Transpose  %d"
1055            "\n   Save       %d"
1056            "\n   Free       %d\n" ,
1057            SEQUENCIAL_TIME / 1000 ,
1058            PARALLEL_TIME / 1000 ,
1059            (max_aloc_ended - min_aloc_start) / 1000 ,
1060            (max_load_ended - min_load_start) / 1000 ,
1061            (max_trsp_ended - min_trsp_start) / 1000 ,
1062            (max_save_ended - min_save_start) / 1000 ,
1063            (max_free_ended - min_free_start) / 1000 );
1064}  // end instrument()
1065
1066
1067
1068
Note: See TracBrowser for help on using the repository browser.