///////////////////////////////////////////////////////////////////////////////////////
// File   : convol.c
// Date   : june 2014
// author : Alain Greiner
///////////////////////////////////////////////////////////////////////////////////////
// This multi-threaded application implements a 2D convolution product.
// It can run on a multi-processor, multi-cluster architecture, with one thread
// per processor, and uses the POSIX threads API.
//
// The main() function can be launched on any processor P[x,y,l].
// It performs the initialisations, launches (N-1) threads to run the execute()
// function on the (N-1) processors other than P[x,y,l], calls the execute()
// function itself, and finally calls the instrument() function to display the
// instrumentation results when the parallel execution is completed.
//
// The convolution kernel is [201]*[35] pixels, but it can be factored in two
// independent line and column convolution products.
// The five buffers containing the image are distributed in clusters.
//
// The (1024 * 1024) pixels image is read from a file (2 bytes per pixel).
//
// - number of clusters containing processors must be power of 2 no larger than 256.
// - number of processors per cluster must be power of 2 no larger than 4.
/////////////////////////////////////////////////////////////////////////////////////// #include #include #include #include #include #include #include #define IMAGE_IN_PATH "misc/philips_1024.raw" #define USE_SQT_BARRIER 1 #define VERBOSE 1 #define SUPER_VERBOSE 0 #define USE_DQT_BARRIER 1 #define X_MAX 16 #define Y_MAX 16 #define PROCS_MAX 4 #define CLUSTERS_MAX (X_MAX * Y_MAX) #define THREADS_MAX (X_MAX * Y_MAX * PROCS_MAX] #define INITIAL_DISPLAY_ENABLE 1 #define FINAL_DISPLAY_ENABLE 1 #define PIXEL_SIZE 2 // input image has 2 bytes per pixel #define FBF_TYPE 420 // output image has 1 byte per pixel #define NL 1024 #define NP 1024 #define NB_PIXELS (NP * NL) #define FRAME_SIZE (NB_PIXELS * PIXEL_SIZE) #define TA(c,l,p) (A[c][((NP) * (l)) + (p)]) #define TB(c,p,l) (B[c][((NL) * (p)) + (l)]) #define TC(c,l,p) (C[c][((NP) * (l)) + (p)]) #define TD(c,l,p) (D[c][((NP) * (l)) + (p)]) #define TZ(c,l,p) (Z[c][((NP) * (l)) + (p)]) #define max(x,y) ((x) > (y) ? (x) : (y)) #define min(x,y) ((x) < (y) ? 
(x) : (y)) ////////////////////////////////////////////////////////// // global variables stored in seg_data in cluster[0,0] ////////////////////////////////////////////////////////// // Instrumentation counters (cluster_id, lpid] unsigned int START[CLUSTERS_MAX][PROCS_MAX]; unsigned int H_BEG[CLUSTERS_MAX][PROCS_MAX]; unsigned int H_END[CLUSTERS_MAX][PROCS_MAX]; unsigned int V_BEG[CLUSTERS_MAX][PROCS_MAX]; unsigned int V_END[CLUSTERS_MAX][PROCS_MAX]; unsigned int D_BEG[CLUSTERS_MAX][PROCS_MAX]; unsigned int D_END[CLUSTERS_MAX][PROCS_MAX]; // file pointers on input image FILE * f_image_in; FILE * f_instrum; // return values at thread exit unsigned int THREAD_EXIT_SUCCESS = 0; unsigned int THREAD_EXIT_FAILURE = 1; // synchronization barrier pthread_barrier_t barrier; // coordinates of core executing the main thread unsigned int cxy_main; unsigned int lid_main; // arrays of pointers on distributed buffers in all clusters unsigned short * GA[CLUSTERS_MAX]; int * GB[CLUSTERS_MAX]; int * GC[CLUSTERS_MAX]; int * GD[CLUSTERS_MAX]; unsigned char * GZ[CLUSTERS_MAX]; // trdid[] array for execution threads // 1D array if no explicit threads placement / 2D array if explicit placement pthread_t trdid[CLUSTERS_MAX][PROCS_MAX]; //pthread_t trdid[THREADS_MAX]; // attr[] array for execution threads // unused if no explicit threads placement pthread_attr_t attr[CLUSTERS_MAX][PROCS_MAX]; ///////////////////////////////////////////////////////////////////////////////////// // functions declaration ///////////////////////////////////////////////////////////////////////////////////// void execute( void ); void instrument( unsigned int nclusters, unsigned int ncores ); ///////////////// void main( void ) { unsigned int x_size; // number of clusters in a row unsigned int y_size; // number of clusters in a column unsigned int ncores; // number of processors per cluster unsigned long long date; char name[64]; // instrumentation file name char path[128]; // instrumentation path name int 
error; // get platform parameters if ( get_config( &x_size , &y_size , &ncores ) ) { printf("\n[convol error] cannot get hardware configuration\n"); exit( 0 ); } // get core executing this main thread // and register these coordinates in global variables get_core_id( &cxy_main , &lid_main ); // check ncores if( (ncores != 1) && (ncores != 2) && (ncores != 4) ) { printf("\n[convol error] number of cores per cluster must be 1/2/4\n"); exit( 0 ); } // check x_size if( (x_size != 1) && (x_size != 2) && (x_size != 4) && (x_size != 8) && (x_size != 16) ) { printf("\n[convol error] x_size must be 1/2/4/8/16\n"); exit( 0 ); } // check y_size if( (y_size != 1) && (y_size != 2) && (y_size != 4) && (y_size != 8) && (y_size != 16) ) { printf("\n[convol error] y_size must be 1/2/4/8/16\n"); exit( 0 ); } // compute nthreads and nclusters unsigned int nthreads = x_size * y_size * ncores; unsigned int nclusters = x_size * y_size; get_cycle( &date ); printf("\n[convol] starts on core[%x,%d] / %d thread(s) / cycle %d\n", cxy_main, lid_main, nthreads, (unsigned int)date ); // build instrumentation file name if( USE_DQT_BARRIER ) snprintf( name , 64 , "p_convol_dqt_%d_%d", x_size * y_size , ncores ); else snprintf( name , 64 , "p_convol_smp_%d_%d", x_size * y_size , ncores ); // build pathname snprintf( path , 128 , "/home/%s", name ); // open instrumentation file f_instrum = fopen( path , NULL ); if ( f_instrum == NULL ) { printf("\n[convol error] cannot open instrumentation file <%s>\n", path ); exit( 0 ); } #if DEBUG_MAIN get_cycle( &date ); printf("\n[convol] main on core[%x,%d] open file <%s> at cycle %d\n", cxy_main, lid_main, path, (unsigned int)date ); #endif // open input file f_image_in = fopen( IMAGE_IN_PATH , NULL ); if ( f_image_in == NULL ) { printf("\n[convol error] cannot open input file <%s>\n", IMAGE_IN_PATH ); exit( 0 ); } #if DEBUG_MAIN get_cycle( &date ); printf("\n[convol] main on core[%x,%d] open file <%s> at cycle %d\n", cxy_main, lid_main, path, (unsigned 
int)date ); #endif // get FBF config unsigned int fbf_width; unsigned int fbf_height; unsigned int fbf_type; fbf_get_config( &fbf_width , &fbf_height , &fbf_type ); // check FBF size if ( (fbf_width != NP) || (fbf_height != NL) ) { printf("\n[convol error] bad FBF size\n"); exit( 0 ); } // check FBF subsampling if ( fbf_type != FBF_TYPE ) { printf("\n[convol error] bad FBF subsampling\n"); exit( 0 ); } // initialise barrier if( USE_DQT_BARRIER ) { pthread_barrierattr_t attr; attr.x_size = x_size; attr.y_size = y_size; attr.nthreads = ncores; error = pthread_barrier_init( &barrier, &attr , nthreads ); } else { error = pthread_barrier_init( &barrier, NULL , nthreads ); } if( error ) { printf("\n[convol error] cannot initialize barrier\n"); exit( 0 ); } get_cycle( &date ); printf("\n[convol] main on core[%x,%d] completes initialisation at cycle %d\n" "- CLUSTERS = %d\n" "- PROCS = %d\n" "- THREADS = %d\n", cxy_main, lid_main, (unsigned int)date, nclusters, ncores, nthreads ); // launch exec threads with explicit placement unsigned int x; unsigned int y; unsigned int l; unsigned int cxy; for( x = 0 ; x < x_size ; x++ ) { for( y = 0 ; y < y_size ; y++ ) { cxy = HAL_CXY_FROM_XY(x,y); for( l = 0 ; l < ncores ; l++ ) { // no other thread on the core running the main if( (cxy != cxy_main) || (l != lid_main) ) { // define thread attributes attr[cxy][l].attributes = PT_ATTR_CLUSTER_DEFINED | PT_ATTR_CORE_DEFINED; attr[cxy][l].cxy = cxy; attr[cxy][l].lid = l; // create thread on core[x,y,l] if (pthread_create( &trdid[cxy][l], &attr[cxy][l], &execute, NULL ) ) // execute has no argument { printf("\n[convol error] created thread %x on core[%x][%d]\n", trdid[cxy][l] , cxy , l ); exit( 0 ); } } } } } /* // launch other threads without explicit placement for ( n = 1 ; n < nthreads ; n++ ) { if ( giet_pthread_create( &trdid[n], NULL, // no attribute &execute, NULL ) ) // no argument { printf("\n[convol error] creating thread %x\n", trdid[n] ); exit( 0 ); } } */ // the main thread 
run itself the execute() function execute(); // wait other threads completions if explicit threads placement for( x = 0 ; x < x_size ; x++ ) { for( y = 0 ; y < y_size ; y++ ) { unsigned int cxy = HAL_CXY_FROM_XY(x,y); for( l = 0 ; l < ncores ; l++ ) { // no other thread on the core running the main if( (cxy != cxy_main) || (l != lid_main) ) { unsigned int * exit_status; // wait thread running on core[x,y,l] if (pthread_join( trdid[cxy][l] , (void*)(&exit_status) ) ) { printf("\n[convol error] main cannot join thread[%x,%d]\n", cxy, l ); exit( 0 ); } // check exit_status if( *exit_status != 0 ) { printf("\n[convol error] thread[%x,%d]return failure\n", cxy, l ); exit( 0 ); } } } } } /* // wait other threads completion when no explicit threads placement for ( n = 1 ; n < nthreads ; n++ ) { if ( pthread_join( trdid[n], NULL ) ) { printf("\n[convol error] joining thread %x\n", trdid[n] ); exit( 0 ); } } */ // call the instrument() function instrument( nclusters , ncores ); exit( 0 ); } // end main() ////////////// void execute() { unsigned long long date; // Each thread[x,y,p] initialises the convolution kernel parameters in local stack. // The values defined in the next 12 lines are Philips proprietary information. 
int vnorm = 115; int vf[35] = { 1, 1, 2, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 3, 3, 3, 2, 2, 2, 2, 1, 1 }; unsigned int hrange = 100; unsigned int hnorm = 201; // get plat-form config unsigned int x_size; // number of clusters in a row unsigned int y_size; // number of clusters in a column unsigned int ncores; // number of processors per cluster get_config( &x_size , &y_size , &ncores ); // get cluster indentifier and core local index unsigned int cxy; unsigned int lid; get_core_id( &cxy , &lid ); unsigned int x = HAL_X_FROM_CXY( cxy ); unsigned int y = HAL_Y_FROM_CXY( cxy ); // indexes for loops unsigned int c; // cluster index unsigned int l; // line index unsigned int p; // pixel index unsigned int z; // vertical filter index unsigned int nclusters = x_size * y_size; // number of clusters unsigned int cluster_id = (x * y_size) + y; // continuous cluster index unsigned int thread_id = (cluster_id * ncores) + lid; // continuous thread index unsigned int nthreads = nclusters * ncores; // number of threads unsigned int frame_size = FRAME_SIZE; // total size (bytes) unsigned int lines_per_thread = NL / nthreads; // lines per thread unsigned int lines_per_cluster = NL / nclusters; // lines per cluster unsigned int pixels_per_thread = NP / nthreads; // columns per thread unsigned int pixels_per_cluster = NP / nclusters; // columns per cluster unsigned int first, last; get_cycle( &date ); START[cluster_id][lid] = (unsigned int)date; // Each thread[cxy][0] allocate the global buffers in cluster cxy if ( lid == 0 ) { #if VERBOSE printf( "\n[convol] thread[%x,%d] enters malloc at cycle %d\n", cxy , lid , (unsigned int)date ); #endif GA[cluster_id] = remote_malloc( (FRAME_SIZE/nclusters) , cxy ); GB[cluster_id] = remote_malloc( (FRAME_SIZE/nclusters)*2 , cxy ); GC[cluster_id] = remote_malloc( (FRAME_SIZE/nclusters)*2 , cxy ); GD[cluster_id] = remote_malloc( (FRAME_SIZE/nclusters)*2 , cxy ); GZ[cluster_id] = remote_malloc( 
(FRAME_SIZE/nclusters)/2 , cxy ); #if VERBOSE printf( "\n[convol] Shared Buffer Virtual Addresses in cluster %x\n" "### GA = %x\n" "### GB = %x\n" "### GC = %x\n" "### GD = %x\n" "### GZ = %x\n", cxy, GA[cluster_id], GB[cluster_id], GC[cluster_id], GD[cluster_id], GZ[cluster_id] ); #endif } //////////////////////////////// pthread_barrier_wait( &barrier ); // Each thread[cxy,p] initialise in its private stack a copy of the // arrays of pointers on the shared, distributed buffers. unsigned short * A[CLUSTERS_MAX]; int * B[CLUSTERS_MAX]; int * C[CLUSTERS_MAX]; int * D[CLUSTERS_MAX]; unsigned char * Z[CLUSTERS_MAX]; for( c = 0 ; c < nclusters ; c++ ) { A[c] = GA[c]; B[c] = GB[c]; C[c] = GC[c]; D[c] = GD[c]; Z[c] = GZ[c]; } // Each thread[x,y,0] access the file containing the input image, to load // the local A[c] buffer (frame_size / nclusters loaded in each cluster). // Other threads are waiting on the barrier. if ( lid==0 ) { unsigned int offset = (frame_size/nclusters)*cluster_id; unsigned int size = frame_size/nclusters; // seek the pointer in file if ( fseek( f_image_in, offset, SEEK_SET ) ) { printf("\n[convol error] in %s : thread[%x,%d] cannot seek input file\n", __FUNCTION__ , cxy , lid ); pthread_exit( &THREAD_EXIT_FAILURE ); } if ( fread( A[cluster_id], 1, size, f_image_in ) != size ) { printf("\n[convol error] in %s : thread[%x,%d] cannot read input file\n", __FUNCTION__ , cxy , lid ); pthread_exit( &THREAD_EXIT_FAILURE ); } #if VERBOSE get_cycle( &date ); printf( "\n[convol] thread[%x,%d] load input file at cycle %d\n", cxy , lid , (unsigned int)date ); #endif } // Optionnal parallel display of the initial image stored in A[c] buffers. // Eah thread[x,y,p] displays (NL/nthreads) lines. (one byte per pixel). 
if ( INITIAL_DISPLAY_ENABLE ) { unsigned int line; unsigned int offset = lines_per_thread * lid; for ( l = 0 ; l < lines_per_thread ; l++ ) { line = offset + l; for ( p = 0 ; p < NP ; p++ ) { TZ(cluster_id, line, p) = (unsigned char)(TA(cluster_id, line, p) >> 8); } if (fbf_write( &TZ(cluster_id, line, 0), // first pixel in TZ NP, // number of bytes NP*(l + (thread_id * lines_per_thread)))) // offset in FBF { printf("\n[convol error] in %s : thread[%x,%d] cannot access FBF\n", __FUNCTION__ , cxy , lid ); pthread_exit( &THREAD_EXIT_FAILURE ); } } #if VERBOSE get_cycle( &date ); printf( "\n[convol] thread[%x,%d] completes initial display at cycle %d\n", cxy , lid , (unsigned int)date ); #endif //////////////////////////////// pthread_barrier_wait( &barrier ); } //////////////////////////////////////////////////////////// // parallel horizontal filter : // B <= transpose(FH(A)) // D <= A - FH(A) // Each thread computes (NL/nthreads) lines // The image must be extended : // if (z<0) TA(cluster_id,l,z) == TA(cluster_id,l,0) // if (z>NP-1) TA(cluster_id,l,z) == TA(cluster_id,l,NP-1) //////////////////////////////////////////////////////////// get_cycle( &date ); H_BEG[cluster_id][lid] = (unsigned int)date; #if VERBOSE printf( "\n[convol] thread[%x,%d] starts horizontal filter at cycle %d\n", cxy , lid , (unsigned int)date ); #else if ( (cxy == cxy_main) && (lid == lid_main) ) printf( "\n[convol] thread[%x,%d] starts horizontal filter at cycle %d\n", cxy , lid , (unsigned int)date ); #endif // l = absolute line index / p = absolute pixel index // first & last define which lines are handled by a given thread first = thread_id * lines_per_thread; last = first + lines_per_thread; for (l = first; l < last; l++) { // src_c and src_l are the cluster index and the line index for A & D int src_c = l / lines_per_cluster; int src_l = l % lines_per_cluster; // We use the specific values of the horizontal ep-filter for optimisation: // sum(p) = sum(p-1) + TA[p+hrange] - 
TA[p-hrange-1] // To minimize the number of tests, the loop on pixels is split in three domains int sum_p = (hrange + 2) * TA(src_c, src_l, 0); for (z = 1; z < hrange; z++) { sum_p = sum_p + TA(src_c, src_l, z); } // first domain : from 0 to hrange for (p = 0; p < hrange + 1; p++) { // dst_c and dst_p are the cluster index and the pixel index for B int dst_c = p / pixels_per_cluster; int dst_p = p % pixels_per_cluster; sum_p = sum_p + (int) TA(src_c, src_l, p + hrange) - (int) TA(src_c, src_l, 0); TB(dst_c, dst_p, l) = sum_p / hnorm; TD(src_c, src_l, p) = (int) TA(src_c, src_l, p) - sum_p / hnorm; } // second domain : from (hrange+1) to (NP-hrange-1) for (p = hrange + 1; p < NP - hrange; p++) { // dst_c and dst_p are the cluster index and the pixel index for B int dst_c = p / pixels_per_cluster; int dst_p = p % pixels_per_cluster; sum_p = sum_p + (int) TA(src_c, src_l, p + hrange) - (int) TA(src_c, src_l, p - hrange - 1); TB(dst_c, dst_p, l) = sum_p / hnorm; TD(src_c, src_l, p) = (int) TA(src_c, src_l, p) - sum_p / hnorm; } // third domain : from (NP-hrange) to (NP-1) for (p = NP - hrange; p < NP; p++) { // dst_c and dst_p are the cluster index and the pixel index for B int dst_c = p / pixels_per_cluster; int dst_p = p % pixels_per_cluster; sum_p = sum_p + (int) TA(src_c, src_l, NP - 1) - (int) TA(src_c, src_l, p - hrange - 1); TB(dst_c, dst_p, l) = sum_p / hnorm; TD(src_c, src_l, p) = (int) TA(src_c, src_l, p) - sum_p / hnorm; } #if SUPER_VERBOSE get_cycle( &date ); printf(" - line %d computed at cycle %d\n", l, (unsigned int)date ); #endif } get_cycle( &date ); H_END[cluster_id][lid] = (unsigned int)date; #if VERBOSE printf( "\n[convol] thread[%x,%d] completes horizontal filter at cycle %d\n", cxy , lid, (unsigned int)date ); #else if ( (cxy == cxy_main) && (lid == lid_main) ) printf( "\n[convol] thread[%x,%d] completes horizontal filter at cycle %d\n", cxy , lid, (unsigned int)date ); #endif //////////////////////////////// pthread_barrier_wait( &barrier ); 
/////////////////////////////////////////////////////////////// // parallel vertical filter : // C <= transpose(FV(B)) // Each thread computes (NP/nthreads) columns // The image must be extended : // if (l<0) TB(cluster_id,p,l) == TB(cluster_id,p,0) // if (l>NL-1) TB(cluster_id,p,l) == TB(cluster_id,p,NL-1) /////////////////////////////////////////////////////////////// get_cycle( &date ); V_BEG[cluster_id][lid] = (unsigned int)date; #if VERBOSE printf( "\n[convol] thread[%x,%d] starts vertical filter at cycle %d\n", cxy , lid , (unsigned int)date ); #else if ( (cxy == cxy_main) && (lid == lid_main) ) printf( "\n[convol] thread[%x,%d] starts vertical filter at cycle %d\n", cxy , lid, (unsigned int)date ); #endif // l = absolute line index / p = absolute pixel index // first & last define which pixels are handled by a given thread first = thread_id * pixels_per_thread; last = first + pixels_per_thread; for (p = first; p < last; p++) { // src_c and src_p are the cluster index and the pixel index for B int src_c = p / pixels_per_cluster; int src_p = p % pixels_per_cluster; int sum_l; // We use the specific values of the vertical ep-filter // To minimize the number of tests, the NL lines are split in three domains // first domain : explicit computation for the first 18 values for (l = 0; l < 18; l++) { // dst_c and dst_l are the cluster index and the line index for C int dst_c = l / lines_per_cluster; int dst_l = l % lines_per_cluster; for (z = 0, sum_l = 0; z < 35; z++) { sum_l = sum_l + vf[z] * TB(src_c, src_p, max(l - 17 + z,0) ); } TC(dst_c, dst_l, p) = sum_l / vnorm; } // second domain for (l = 18; l < NL - 17; l++) { // dst_c and dst_l are the cluster index and the line index for C int dst_c = l / lines_per_cluster; int dst_l = l % lines_per_cluster; sum_l = sum_l + TB(src_c, src_p, l + 4) + TB(src_c, src_p, l + 8) + TB(src_c, src_p, l + 11) + TB(src_c, src_p, l + 15) + TB(src_c, src_p, l + 17) - TB(src_c, src_p, l - 5) - TB(src_c, src_p, l - 9) - TB(src_c, 
src_p, l - 12) - TB(src_c, src_p, l - 16) - TB(src_c, src_p, l - 18); TC(dst_c, dst_l, p) = sum_l / vnorm; } // third domain for (l = NL - 17; l < NL; l++) { // dst_c and dst_l are the cluster index and the line index for C int dst_c = l / lines_per_cluster; int dst_l = l % lines_per_cluster; sum_l = sum_l + TB(src_c, src_p, min(l + 4, NL - 1)) + TB(src_c, src_p, min(l + 8, NL - 1)) + TB(src_c, src_p, min(l + 11, NL - 1)) + TB(src_c, src_p, min(l + 15, NL - 1)) + TB(src_c, src_p, min(l + 17, NL - 1)) - TB(src_c, src_p, l - 5) - TB(src_c, src_p, l - 9) - TB(src_c, src_p, l - 12) - TB(src_c, src_p, l - 16) - TB(src_c, src_p, l - 18); TC(dst_c, dst_l, p) = sum_l / vnorm; } #if SUPER_VERBOSE get_cycle( &date ); printf(" - column %d computed at cycle %d\n", p, (unsigned int)date ); #endif } get_cycle( &date ); V_END[cluster_id][lid] = (unsigned int)date; #if VERBOSE printf( "\n[convol] thread[%x,%d] completes vertical filter at cycle %d\n", cxy , lid , (unsigned int)date ); #else if ( (cxy == cxy_main) && (lid == lid_main) ) printf( "\n[convol] thread[%x,%d] completes vertical filter at cycle %d\n", cxy , lid, (unsigned int)date ); #endif //////////////////////////////// pthread_barrier_wait( &barrier ); // Optional parallel display of the final image Z <= D + C // Eah thread[x,y,p] displays (NL/nthreads) lines. (one byte per pixel). 
if ( FINAL_DISPLAY_ENABLE ) { get_cycle( &date ); D_BEG[cluster_id][lid] = (unsigned int)date; #if VERBOSE printf( "\n[convol] thread[%x,%d] starts final display at cycle %d\n", cxy , lid , (unsigned int)date ); #else if ( (cxy == cxy_main) && (lid == lid_main) ) printf( "\n[convol] thread[%x,%d] starts final display at cycle %d\n", cxy , lid, (unsigned int)date ); #endif unsigned int line; unsigned int offset = lines_per_thread * lid; for ( l = 0 ; l < lines_per_thread ; l++ ) { line = offset + l; for ( p = 0 ; p < NP ; p++ ) { TZ(cluster_id, line, p) = (unsigned char)( (TD(cluster_id, line, p) + TC(cluster_id, line, p) ) >> 8 ); } if (fbf_write( &TZ(cluster_id, line, 0), // first pixel in TZ NP, // number of bytes NP*(l + (thread_id * lines_per_thread)))) // offset in FBF { printf("\n[convol error] in %s : thread[%d,%d,%d] cannot access FBF\n", __FUNCTION__ , x , y , lid ); pthread_exit( &THREAD_EXIT_FAILURE ); } } get_cycle( &date ); D_END[cluster_id][lid] = (unsigned int)date; #if VERBOSE printf( "\n[convol] thread[%x,%d] completes final display at cycle %d\n", cxy , lid , (unsigned int)date ); #else if ( (cxy == cxy_main) && (lid == lid_main) ) printf( "\n[convol] thread[%x,%d] completes final display at cycle %d\n", cxy , lid , (unsigned int)date ); #endif //////////////////////////////// pthread_barrier_wait( &barrier ); } // all threads (but the one executing main) exit if ( (cxy != cxy_main) || (lid != lid_main) ) { pthread_exit( &THREAD_EXIT_SUCCESS ); } } // end execute() ///////////////////////////////////////// void instrument( unsigned int nclusters, unsigned int ncores ) { unsigned int cc, pp; unsigned int min_start = 0xFFFFFFFF; unsigned int max_start = 0; unsigned int min_h_beg = 0xFFFFFFFF; unsigned int max_h_beg = 0; unsigned int min_h_end = 0xFFFFFFFF; unsigned int max_h_end = 0; unsigned int min_v_beg = 0xFFFFFFFF; unsigned int max_v_beg = 0; unsigned int min_v_end = 0xFFFFFFFF; unsigned int max_v_end = 0; unsigned int min_d_beg = 0xFFFFFFFF; 
unsigned int max_d_beg = 0; unsigned int min_d_end = 0xFFFFFFFF; unsigned int max_d_end = 0; for (cc = 0; cc < nclusters; cc++) { for (pp = 0; pp < ncores; pp++ ) { if (START[cc][pp] < min_start) min_start = START[cc][pp]; if (START[cc][pp] > max_start) max_start = START[cc][pp]; if (H_BEG[cc][pp] < min_h_beg) min_h_beg = H_BEG[cc][pp]; if (H_BEG[cc][pp] > max_h_beg) max_h_beg = H_BEG[cc][pp]; if (H_END[cc][pp] < min_h_end) min_h_end = H_END[cc][pp]; if (H_END[cc][pp] > max_h_end) max_h_end = H_END[cc][pp]; if (V_BEG[cc][pp] < min_v_beg) min_v_beg = V_BEG[cc][pp]; if (V_BEG[cc][pp] > max_v_beg) max_v_beg = V_BEG[cc][pp]; if (V_END[cc][pp] < min_v_end) min_v_end = V_END[cc][pp]; if (V_END[cc][pp] > max_v_end) max_v_end = V_END[cc][pp]; if (D_BEG[cc][pp] < min_d_beg) min_d_beg = D_BEG[cc][pp]; if (D_BEG[cc][pp] > max_d_beg) max_d_beg = D_BEG[cc][pp]; if (D_END[cc][pp] < min_d_end) min_d_end = D_END[cc][pp]; if (D_END[cc][pp] > max_d_end) max_d_end = D_END[cc][pp]; } } printf(" - START : min = %d / max = %d / med = %d / delta = %d\n", min_start, max_start, (min_start+max_start)/2, max_start-min_start); printf(" - H_BEG : min = %d / max = %d / med = %d / delta = %d\n", min_h_beg, max_h_beg, (min_h_beg+max_h_beg)/2, max_h_beg-min_h_beg); printf(" - H_END : min = %d / max = %d / med = %d / delta = %d\n", min_h_end, max_h_end, (min_h_end+max_h_end)/2, max_h_end-min_h_end); printf(" - V_BEG : min = %d / max = %d / med = %d / delta = %d\n", min_v_beg, max_v_beg, (min_v_beg+max_v_beg)/2, max_v_beg-min_v_beg); printf(" - V_END : min = %d / max = %d / med = %d / delta = %d\n", min_v_end, max_v_end, (min_v_end+max_v_end)/2, max_v_end-min_v_end); printf(" - D_BEG : min = %d / max = %d / med = %d / delta = %d\n", min_d_beg, max_d_beg, (min_d_beg+max_d_beg)/2, max_d_beg-min_d_beg); printf(" - D_END : min = %d / max = %d / med = %d / delta = %d\n", min_d_end, max_d_end, (min_d_end+max_d_end)/2, max_d_end-min_d_end); printf( "\n General Scenario (Kcycles for each step)\n" ); printf( 
" - BOOT OS = %d\n", (min_start )/1000 ); printf( " - LOAD IMAGE = %d\n", (min_h_beg - min_start)/1000 ); printf( " - H_FILTER = %d\n", (max_h_end - min_h_beg)/1000 ); printf( " - BARRIER HORI/VERT = %d\n", (min_v_beg - max_h_end)/1000 ); printf( " - V_FILTER = %d\n", (max_v_end - min_v_beg)/1000 ); printf( " - BARRIER VERT/DISP = %d\n", (min_d_beg - max_v_end)/1000 ); printf( " - DISPLAY = %d\n", (max_d_end - min_d_beg)/1000 ); // TODO save these results on f_instrum } // end instrument() // Local Variables: // tab-width: 3 // c-basic-offset: 3 // c-file-offsets:((innamespace . 0)(inline-open . 0)) // indent-tabs-mode: nil // End: // vim: filetype=cpp:expandtab:shiftwidth=3:tabstop=3:softtabstop=3