#include "stdio.h" //////////////////////////////////// // Image parameters #define PIXEL_SIZE 2 #define NL 1024 #define NP 1024 #define BLOCK_SIZE 1024 #define PRINTF if(lid==0) tty_printf #define TA(c,l,p) (A[c][((NP)*(l))+(p)]) #define TB(c,p,l) (B[c][((NL)*(p))+(l)]) #define TC(c,l,p) (C[c][((NP)*(l))+(p)]) #define TD(c,l,p) (D[c][((NP)*(l))+(p)]) #define TZ(c,l,p) (Z[c][((NP)*(l))+(p)]) #define max(x,y) ((x) > (y) ? (x) : (y)) #define min(x,y) ((x) < (y) ? (x) : (y)) /////////////////////////////////////////// // tricks to read parameters from ldscript /////////////////////////////////////////// struct plaf; extern struct plaf seg_heap_base; extern struct plaf NB_PROCS; extern struct plaf NB_CLUSTERS; ///////////// void main() { ////////////////////////////////// // convolution kernel parameters // The content of this section is // Philips proprietary information. /////////////////////////////////// int vnorm = 115; int vf[35]; vf[0] = 1; vf[1] = 1; vf[2] = 2; vf[3] = 2; vf[4] = 2; vf[5] = 2; vf[6] = 3; vf[7] = 3; vf[8] = 3; vf[9] = 4; vf[10] = 4; vf[11] = 4; vf[12] = 4; vf[13] = 5; vf[14] = 5; vf[15] = 5; vf[16] = 5; vf[17] = 5; vf[18] = 5; vf[19] = 5; vf[20] = 5; vf[21] = 5; vf[22] = 4; vf[23] = 4; vf[24] = 4; vf[25] = 4; vf[26] = 3; vf[27] = 3; vf[28] = 3; vf[29] = 2; vf[30] = 2; vf[31] = 2; vf[32] = 2; vf[33] = 1; vf[34] = 1; int hrange = 100; int hnorm = 201; unsigned int date = 0; int c; // cluster index for loops int l; // line index for loops int p; // pixel index for loops int x; // filter index for loops int pid = procid(); // processor id int nprocs = (int)&NB_PROCS; // number of processors per cluster int nclusters = (int)&NB_CLUSTERS; // number of clusters int lid = pid%nprocs; // local task id int cid = pid/nprocs; // cluster task id int base = (unsigned int)&seg_heap_base; // base address for shared buffers int increment = (0x80000000 / nclusters) * 2; // cluster increment int ntasks = nclusters * nprocs; // number of tasks int nblocks = (NP*NL*PIXEL_SIZE)/BLOCK_SIZE; // number of blocks per image int lines_per_task = NL/ntasks; // number of lines per task int lines_per_cluster = NL/nclusters; // number of lines per cluster int pixels_per_task = NP/ntasks; // number of columns per task int pixels_per_cluster = NP/nclusters; // number of columns per cluster int first, last; PRINTF("\n*** Processor %d entering main at cycle %d ***\n\n", pid, proctime()); ////////////////////////// // parameters checking if( (nprocs != 1) && (nprocs != 2) && (nprocs != 4) ) { PRINTF("NB_PROCS must be 1, 2 or 4\n"); while(1); } if( (nclusters != 4) && (nclusters != 8) && (nclusters != 16) && (nclusters != 32) && (nclusters != 64) && (nclusters !=128) && (nclusters != 256) ) { PRINTF("NB_CLUSTERS must be a power of 2 between 4 and 256\n"); while(1); } if( pid >= ntasks ) { PRINTF("processor id %d larger than NB_CLUSTERS*NB_PROCS\n", pid); while(1); } if ( NL % nclusters != 0 ) { PRINTF("NB_CLUSTERS must be a divider of NL"); while(1); } if( NP % nclusters != 0 ) { PRINTF("NB_CLUSTERS must be a divider of NP"); while(1); } ////////////////////////////////////////////////////////////////// // Arrays of pointers on the shared, distributed buffers // containing the images (sized for the worst case : 256 clusters) unsigned short* A[256]; int* B[256]; int* C[256]; int* D[256]; unsigned char* Z[256]; // Arrays of pointers on the instrumentation arrays // These arrays are indexed by the cluster index (sized for the worst case : 256 clusters) // each pointer points on the base adress of an array of 4 (NPROCS max) unsigned int unsigned int* LOAD_START[256]; unsigned int* LOAD_ENDED[256]; unsigned int* VERT_START[256]; unsigned int* VERT_ENDED[256]; unsigned int* HORI_START[256]; unsigned int* HORI_ENDED[256]; unsigned int* DISP_START[256]; unsigned int* DISP_ENDED[256]; // The shared, distributed buffers addresses are computed // from the seg_heap_base value defined in the ldscript file // and from the cluster increment = 4Gbytes/nclusters. // These arrays of pointers are identical and // replicated in the stack of each task for( c=0 ; cNP-1) TA(cid,l,z) == TA(cid,l,NP-1) date = proctime(); PRINTF("\n*** Starting horizontal filter at cycle %d\n", date); HORI_START[cid][lid] = date; // l = absolute line index / p = absolute pixel index // first & last define which lines are handled by a given task(cid,lid) first = (cid*nprocs + lid)*lines_per_task; last = first + lines_per_task; for ( l=first ; lNL-1) TB(cid,p,x) == TB(cid,p,NL-1) date = proctime(); PRINTF("\n*** starting vertical filter at cycle %d\n", date); VERT_START[cid][lid] = date; // l = absolute line index / p = absolute pixel index // first & last define which pixels are handled by a given task(cid,lid) first = (cid*nprocs + lid)*pixels_per_task; last = first + pixels_per_task; for ( p=first ; p>8) & 0xFF); } fb_write(NP*(cid*lines_per_cluster+l), &TZ(cid,l,0), NP); } date = proctime(); PRINTF("*** Completing display at cycle %d\n", date); DISP_ENDED[cid][lid] = date; barrier_wait(3); ///////////////////////////////////////////////////////// // Instrumentation (done by processor 0 in cluster 0) if ( pid == 0 ) { date = proctime(); PRINTF("\n*** Starting Instrumentation at cycle %d\n\n", date); int cc, pp; unsigned int min_load_start = 1000000000; unsigned int max_load_start = 0; unsigned int min_load_ended = 1000000000; unsigned int max_load_ended = 0; unsigned int min_hori_start = 1000000000; unsigned int max_hori_start = 0; unsigned int min_hori_ended = 1000000000; unsigned int max_hori_ended = 0; unsigned int min_vert_start = 1000000000; unsigned int max_vert_start = 0; unsigned int min_vert_ended = 1000000000; unsigned int max_vert_ended = 0; unsigned int min_disp_start = 1000000000; unsigned int max_disp_start = 0; unsigned int min_disp_ended = 1000000000; unsigned int max_disp_ended = 0; for ( cc=0 ; cc max_load_start ) max_load_start = LOAD_START[cc][pp]; if ( LOAD_ENDED[cc][pp] < min_load_ended ) min_load_ended = LOAD_ENDED[cc][pp]; if ( LOAD_ENDED[cc][pp] > max_load_ended ) max_load_ended = LOAD_ENDED[cc][pp]; if ( HORI_START[cc][pp] < min_hori_start ) min_hori_start = HORI_START[cc][pp]; if ( HORI_START[cc][pp] > max_hori_start ) max_hori_start = HORI_START[cc][pp]; if ( HORI_ENDED[cc][pp] < min_hori_ended ) min_hori_ended = HORI_ENDED[cc][pp]; if ( HORI_ENDED[cc][pp] > max_hori_ended ) max_hori_ended = HORI_ENDED[cc][pp]; if ( VERT_START[cc][pp] < min_vert_start ) min_vert_start = VERT_START[cc][pp]; if ( VERT_START[cc][pp] > max_vert_start ) max_vert_start = VERT_START[cc][pp]; if ( VERT_ENDED[cc][pp] < min_vert_ended ) min_vert_ended = VERT_ENDED[cc][pp]; if ( VERT_ENDED[cc][pp] > max_vert_ended ) max_vert_ended = VERT_ENDED[cc][pp]; if ( DISP_START[cc][pp] < min_disp_start ) min_disp_start = DISP_START[cc][pp]; if ( DISP_START[cc][pp] > max_disp_start ) max_disp_start = DISP_START[cc][pp]; if ( DISP_ENDED[cc][pp] < min_disp_ended ) min_disp_ended = DISP_ENDED[cc][pp]; if ( DISP_ENDED[cc][pp] > max_disp_ended ) max_disp_ended = DISP_ENDED[cc][pp]; } } PRINTF(" - LOAD_START : min = %d / max = %d / med = %d / delta = %d\n", min_load_start, max_load_start, (min_load_start+max_load_start)/2, max_load_start-min_load_start); PRINTF(" - LOAD_END : min = %d / max = %d / med = %d / delta = %d\n", min_load_ended, max_load_ended, (min_load_ended+max_load_ended)/2, max_load_ended-min_load_ended); PRINTF(" - HORI_START : min = %d / max = %d / med = %d / delta = %d\n", min_hori_start, max_hori_start, (min_hori_start+max_hori_start)/2, max_hori_start-min_hori_start); PRINTF(" - HORI_END : min = %d / max = %d / med = %d / delta = %d\n", min_hori_ended, max_hori_ended, (min_hori_ended+max_hori_ended)/2, max_hori_ended-min_hori_ended); PRINTF(" - VERT_START : min = %d / max = %d / med = %d / delta = %d\n", min_vert_start, max_vert_start, (min_vert_start+max_vert_start)/2, max_vert_start-min_vert_start); PRINTF(" - VERT_END : min = %d / max = %d / med = %d / delta = %d\n", min_vert_ended, max_vert_ended, (min_vert_ended+max_vert_ended)/2, max_vert_ended-min_vert_ended); PRINTF(" - DISP_START : min = %d / max = %d / med = %d / delta = %d\n", min_disp_start, max_disp_start, (min_disp_start+max_disp_start)/2, max_disp_start-min_disp_start); PRINTF(" - DISP_END : min = %d / max = %d / med = %d / delta = %d\n", min_disp_ended, max_disp_ended, (min_disp_ended+max_disp_ended)/2, max_disp_ended-min_disp_ended); PRINTF(" - BARRIER LOAD/HORI = %d\n", min_hori_start - max_load_ended); PRINTF(" - BARRIER HORI/VERT = %d\n", min_vert_start - max_hori_ended); PRINTF(" - BARRIER VERT/DISP = %d\n", min_disp_start - max_vert_ended); PRINTF(" - LOAD = %d\n", max_load_ended); PRINTF(" - FILTER = %d\n", max_vert_ended - max_load_ended); PRINTF(" - DISPLAY = %d\n", max_disp_ended - max_vert_ended); PRINTF("\nBEGIN LOAD_START\n"); for ( cc=0 ; cc