#include "stdio.h" //////////////////////////////////// // Image parameters #define PIXEL_SIZE 2 #define NL 1024 #define NP 1024 #define BLOCK_SIZE 1512 #define PRINTF if(lid==0) tty_printf #define TA(c,l,p) (A[c][((NP)*(l))+(p)]) #define TB(c,p,l) (B[c][((NL)*(p))+(l)]) #define TC(c,l,p) (C[c][((NP)*(l))+(p)]) #define TD(c,l,p) (D[c][((NP)*(l))+(p)]) #define max(x,y) ((x) > (y) ? (x) : (y)) #define min(x,y) ((x) < (y) ? (x) : (y)) /////////////////////////////////////////// // tricks to read parameters from ldscript /////////////////////////////////////////// struct plaf; extern struct plaf seg_heap_base; extern struct plaf NB_PROCS; extern struct plaf NB_CLUSTERS; ///////////// void main() { ////////////////////////////////// // convolution kernel parameters // The content of this section is // Philips proprietary information. /////////////////////////////////// int vrange = 17; int vnorm = 115; int vf[35]; vf[0] = 1; vf[1] = 1; vf[2] = 2; vf[3] = 2; vf[4] = 2; vf[5] = 2; vf[6] = 3; vf[7] = 3; vf[8] = 3; vf[9] = 4; vf[10] = 4; vf[11] = 4; vf[12] = 4; vf[13] = 5; vf[14] = 5; vf[15] = 5; vf[16] = 5; vf[17] = 5; vf[18] = 5; vf[19] = 5; vf[20] = 5; vf[21] = 5; vf[22] = 4; vf[23] = 4; vf[24] = 4; vf[25] = 4; vf[26] = 3; vf[27] = 3; vf[28] = 3; vf[29] = 2; vf[30] = 2; vf[31] = 2; vf[32] = 2; vf[33] = 1; vf[34] = 1; int hrange = 100; int hnorm = 201; unsigned int date = 0; unsigned int delta = 0; int c; // cluster index for loops int l; // line index for loops int p; // pixel index for loops int x; // filter index for loops int pid = procid(); // processor id int nprocs = (unsigned int)&NB_PROCS; // number of processors per cluster int nclusters = (unsigned int)&NB_CLUSTERS; // number of clusters int lid = pid%nprocs; // local processor id int cid = pid/nprocs; // local processor id int base = (unsigned int)&seg_heap_base; // base address for shared buffers int increment = (0x80000000 / nclusters) * 2; // cluster increment int ntasks = nclusters * nprocs; // number of tasks int nblocks = (NP*NL*PIXEL_SIZE)/BLOCK_SIZE; // number of blocks per image int lines_per_task = NL/ntasks; // number of lines per task int lines_per_cluster = NL/nclusters; // number of lines per cluster int columns_per_task = NP/ntasks; // number of columns per task int columns_per_cluster = NP/nclusters; // number of columns per cluster PRINTF("\n *** Processor %d entering main at cycle %d ***\n\n", pid, proctime()); ////////////////////////// // parameters checking if( (nprocs != 1) && (nprocs != 2) && (nprocs != 4) ) { PRINTF("NB_PROCS must be 1, 2 or 4\n"); while(1); } if( (nclusters != 4) && (nclusters != 8) && (nclusters != 16) && (nclusters != 32) && (nclusters != 64) && (nclusters !=128) && (nclusters != 256) ) { PRINTF("NB_CLUSTERS must be a power of 2 between 4 and 256\n"); while(1); } if( pid >= ntasks ) { PRINTF("processor id %d larger than NB_CLUSTERS*NB_PROCS\n", pid); while(1); } if ( NL % nclusters != 0 ) { PRINTF("NB_CLUSTERS must be a divider of NL"); while(1); } if( NP % nclusters != 0 ) { PRINTF("NB_CLUSTERS must be a divider of NP"); while(1); } ////////////////////////////////////////////////////////////////// // Arrays of pointers on the shared, distributed buffers // containing the images (sized for the worst case : 256 clusters) unsigned short* A[256]; int* B[256]; int* C[256]; int* D[256]; // The shared, distributed buffers addresses are computed // from the seg_heap_base value defined in the ldscript file // and from the cluster increment = 4Gbytes/nclusters. // These arrays of pointers are identical and // replicated in the stack of each task for( c=0 ; cNP-1) TA(cid,l,p) == TA(cid,l,NL-1) // We use the spécific values of the horizontal ep-filter for optimisation: // sum(p) = sum(p-1) + TA[p+hrange] - TA[p-hrange-1] // To minimize the number of tests, the loop on pixels is split in three domains int sum = (hrange+2)*TA(cid,l,0); for ( x = 1 ; x < hrange ; x++) sum = sum + TA(cid,l,x); // first domain : from 0 to hrange for ( p = 0 ; p < hrange+1 ; p++) { sum = sum + TA(cid,l,p+hrange) - TA(cid,l,0); TB((p/columns_per_cluster),(p%columns_per_cluster),(cid*lines_per_cluster+l)) = sum/hnorm; TD(cid,l,p) = TA(cid,l,p) - sum/hnorm; } // second domain : from (hrange+1) to (NP-hrange-1) for ( p = hrange+1 ; p < NP-hrange ; p++) { sum = sum + TA(cid,l,p+hrange) - TA(cid,l,p-hrange-1); TB((p/columns_per_cluster),(p%columns_per_cluster),(cid*lines_per_cluster+l)) = sum/hnorm; TD(cid,l,p) = TA(cid,l,p) - sum/hnorm; } // third domain : from (NP-hrange) to (NP-1) for ( p = NP-hrange ; p < NP ; p++) { sum = sum + TA(cid,l,NP-1) - TA(cid,l,p-hrange-1); TB((p/columns_per_cluster),(p%columns_per_cluster),(cid*lines_per_cluster+l)) = sum/hnorm; TD(cid,l,p) = TA(cid,l,p) - sum/hnorm; } PRINTF(" - line %d computed at cycle %d\n", l, proctime()); } delta = proctime() - date; date = date + delta; PRINTF(" *** completing horizontal filter at cycle %d (%d)\n", date, delta); barrier_wait(1); ////////////////////////////////////////////////////////// // parallel vertical filter : // C <= transpose(FV(B)) // each processor computes (NP/ntasks) columns delta = proctime() - date; date = date + delta; PRINTF("\n *** starting vertical filter at cycle %d (%d)\n", date, delta); // l = line index / p = column index in the cluster for ( p = columns_per_task*lid ; p < columns_per_task*(lid+1) ; p++) { unsigned int sum = 0; // The image must be extended : // if (l<0) TB(cid,p,x) == TB(cid,p,0) // if (l>NL-1) TB(cid,p,x) == TB(cid,p,NL-1) // We use the spécific values of the vertical ep-filter // To minimize the number of tests, the NL lines are split in three domains // first domain for ( l = 0 ; l < vrange ; l++) { for ( x = 0 ; x < (2*vrange+1) ; x++ ) { sum = sum + vf[x] * TB(cid,p,max(l-vrange+x,0)); } TC((l/lines_per_cluster),(l%lines_per_cluster),(cid*columns_per_cluster+p)) = sum/vnorm; } // second domain for ( l = vrange ; l < NL-vrange ; l++ ) { sum = sum + TB(cid,p,l+4) + TB(cid,p,l+8) + TB(cid,p,l+11) + TB(cid,p,l+15) + TB(cid,p,l+17) - TB(cid,p,l-5) - TB(cid,p,l-9) - TB(cid,p,l-12) - TB(cid,p,l-16) - TB(cid,p,max(l-18,0)); TC((l/lines_per_cluster),(l%lines_per_cluster),(cid*columns_per_cluster+p)) = sum/vnorm; } // third domain for ( l = NL-vrange ; l < NL ; l++ ) { sum = sum + TB(cid,p,min(l+5,NL-1)) + TB(cid,p,min(l+9,NL-1)) + TB(cid,p,min(l+12,NL-1)) + TB(cid,p,min(l+16,NL-1)) + TB(cid,p,min(l+18,NL-1)) - TB(cid,p,l-4) - TB(cid,p,l-8) - TB(cid,p,l-11) - TB(cid,p,l-15) - TB(cid,p,l-17); TC((l/lines_per_cluster),(l%lines_per_cluster),(cid*columns_per_cluster+p)) = sum/vnorm; } PRINTF(" - column %d computed at cycle %d\n", p, proctime()); } delta = proctime() - date; date = date + delta; PRINTF(" *** completing vertical filter at cycle %d (%d)\n", date, delta); barrier_wait(2); //////////////////////////////////////////////////////////////////////////// // final computation and parallel display using the distributed DMA // D <= D + C // Each processor use its private DMA channel to display // the resulting image, line per line (one byte per pixel). // Eah processor computes & displays (NL/ntasks) lines. delta = proctime() - date; date = date + delta; PRINTF("\n *** final computation and display at cycle %d (%d)\n", date, delta); for ( l = 0 ; l < lines_per_task ; l++) { for ( p = 0 ; p < NP ; p++) { TD(cid,l,p) = TD(cid,l,p) + TC(cid,l,p); line_buf[p] = (unsigned char)(TD(cid,l,p)); } int xxx = ( fb_write( NP*(cid*lines_per_cluster+lid*lines_per_task+l), line_buf, NP) ); if ( xxx ) { PRINTF("echec fb_write = %d\n", xxx); while(1); } if ( fb_completed() ) { PRINTF("echec fb_completed\n"); while(1); } PRINTF(" - line %d displayed at cycle %d\n", l, proctime()); } delta = proctime() - date; date = date + delta; PRINTF(" *** completing display at cycle %d (%d)\n", date, delta); while(1); } // end main()