#include "limits.h" #include "stdio.h" #include "../giet_tsar/block_device.h" //////////////////////////////////// // Image parameters #define NB_CLUSTER_MAX 256 #define PIXEL_SIZE 2 #define NL 1024 #define NP 1024 #define NB_PIXELS ((NP) * (NL)) #define FRAME_SIZE ((NB_PIXELS) * (PIXEL_SIZE)) #define PRINTF(...) ({ if (proc_id == 0) { tty_printf(__VA_ARGS__); } }) #define TA(c,l,p) (A[c][((NP) * (l)) + (p)]) #define TB(c,p,l) (B[c][((NL) * (p)) + (l)]) #define TC(c,l,p) (C[c][((NP) * (l)) + (p)]) #define TD(c,l,p) (D[c][((NP) * (l)) + (p)]) #define TZ(c,l,p) (Z[c][((NP) * (l)) + (p)]) #define max(x,y) ((x) > (y) ? (x) : (y)) #define min(x,y) ((x) < (y) ? (x) : (y)) /////////////////////////////////////////// // tricks to read parameters from ldscript /////////////////////////////////////////// struct plaf; extern struct plouf seg_ioc_base; extern struct plaf seg_heap_base; extern struct plaf NB_PROCS; extern struct plaf NB_CLUSTERS; // Required when initializing an array all at once static void *memcpy(void *_dst, const void *_src, unsigned int size){ unsigned int *dst = _dst; const unsigned int *src = _src; if (! ((unsigned int)dst & 3) && ! ((unsigned int)src & 3)){ while (size > 3){ *dst++ = *src++; size -= 4; } } unsigned char *cdst = (unsigned char*)dst; unsigned char *csrc = (unsigned char*)src; while (size--){ *cdst++ = *csrc++; } return _dst; } void main(){ ////////////////////////////////// // convolution kernel parameters // The content of this section is // Philips proprietary information. /////////////////////////////////// int vnorm = 115; int vf[35] = { 1, 1, 2, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 3, 3, 3, 2, 2, 2, 2, 1, 1 }; int hrange = 100; int hnorm = 201; unsigned int date = 0; int c; // cluster index for loops int l; // line index for loops int p; // pixel index for loops int x; // filter index for loops const unsigned int proc_id = procid(); // processor id const unsigned int nlocal_procs = (int) &NB_PROCS; // number of processors per cluster const unsigned int nclusters = (int) &NB_CLUSTERS; // number of clusters const unsigned int local_id = proc_id % nlocal_procs; // local task id const unsigned int cluster_id = proc_id / nlocal_procs; // cluster task id const unsigned int base = (unsigned int) &seg_heap_base; // base address for shared buffers const unsigned int increment = 0x80000000 / nclusters * 2; // cluster increment const unsigned int nglobal_procs = nclusters * nlocal_procs; // number of tasks const unsigned int npixels = NB_PIXELS; // Number of pixel per frame const unsigned int frame_size = FRAME_SIZE; // Size of 1 frame (in bytes) const unsigned int * ioc_address = (unsigned int *) &seg_ioc_base; const unsigned int block_size = ioc_address[BLOCK_DEVICE_BLOCK_SIZE]; const unsigned int nblocks = frame_size / block_size; // number of blocks per frame const unsigned int lines_per_task = NL / nglobal_procs; // number of lines per task const unsigned int lines_per_cluster = NL / nclusters; // number of lines per cluster const unsigned int pixels_per_task = NP / nglobal_procs; // number of columns per task const unsigned int pixels_per_cluster = NP / nclusters; // number of columns per cluster int first, last; PRINTF("\n*** Processor %d entering main at cycle %d ***\n\n", proc_id, proctime()); ///////////////////////// // parameters checking // ///////////////////////// if ((nlocal_procs != 1) && (nlocal_procs != 2) && (nlocal_procs != 4)){ PRINTF("NB_PROCS must be 1, 2 or 4\n"); exit(); } //////////////////////////////////////////////////////////////////////// // Warning: NB_CLUSTERS must be at least 4 because of the heap size; // // if there are less clusters, the heap mixes with the stack // // (the total heap size must be at least 0x01000000) // //////////////////////////////////////////////////////////////////////// if ((nclusters != 4) && (nclusters != 8) && (nclusters != 16) && (nclusters != 32) && (nclusters != 64) && (nclusters !=128) && (nclusters != 256)){ PRINTF("NB_CLUSTERS must be a power of 2 between 4 and 256\n"); exit(); } if (proc_id >= nglobal_procs){ PRINTF("processor id %d larger than NB_CLUSTERS*NB_PROCS\n", proc_id); exit(); } if (NL % nclusters != 0){ PRINTF("NB_CLUSTERS must be a divider of NL"); exit(); } if (NP % nclusters != 0){ PRINTF("NB_CLUSTERS must be a divider of NP"); exit(); } // Arrays of pointers on the shared, distributed buffers // containing the images (sized for the worst case : 256 clusters) unsigned short * A[NB_CLUSTER_MAX]; int * B[NB_CLUSTER_MAX]; int * C[NB_CLUSTER_MAX]; int * D[NB_CLUSTER_MAX]; unsigned char * Z[NB_CLUSTER_MAX]; // Arrays of pointers on the instrumentation arrays // These arrays are indexed by the cluster index (sized for the worst case : 256 clusters) // each pointer points on the base adress of an array of 4 (NPROCS max) unsigned int unsigned int * LOAD_START[NB_CLUSTER_MAX]; unsigned int * LOAD_END[NB_CLUSTER_MAX]; unsigned int * VERT_START[NB_CLUSTER_MAX]; unsigned int * VERT_END[NB_CLUSTER_MAX]; unsigned int * HORI_START[NB_CLUSTER_MAX]; unsigned int * HORI_END[NB_CLUSTER_MAX]; unsigned int * DISP_START[NB_CLUSTER_MAX]; unsigned int * DISP_END[NB_CLUSTER_MAX]; // The shared, distributed buffers addresses are computed // from the seg_heap_base value defined in the ldscript file // and from the cluster increment = 4Gbytes/nclusters. // These arrays of pointers are identical and // replicated in the stack of each task for (c = 0; c < nclusters; c++){ unsigned int offset = base + increment * c; A[c] = (unsigned short *) (offset); B[c] = (int *) (offset + frame_size * 1 / nclusters); // We increment by 2 * frame_size C[c] = (int *) (offset + frame_size * 3 / nclusters); // because sizeof(int) = 2*sizeof(short) D[c] = (int *) (offset + frame_size * 5 / nclusters); // so an array of frame_size elements of type Z[c] = (unsigned char *) (offset + frame_size * 7 / nclusters); // int can contain the equivalent of 2 frames offset = base + increment * c + frame_size * 8 / nclusters; LOAD_START[c] = (unsigned int *) (offset + nlocal_procs * sizeof(unsigned int) * 0); LOAD_END[c] = (unsigned int *) (offset + nlocal_procs * sizeof(unsigned int) * 1); VERT_START[c] = (unsigned int *) (offset + nlocal_procs * sizeof(unsigned int) * 2); VERT_END[c] = (unsigned int *) (offset + nlocal_procs * sizeof(unsigned int) * 3); HORI_START[c] = (unsigned int *) (offset + nlocal_procs * sizeof(unsigned int) * 4); HORI_END[c] = (unsigned int *) (offset + nlocal_procs * sizeof(unsigned int) * 5); DISP_START[c] = (unsigned int *) (offset + nlocal_procs * sizeof(unsigned int) * 6); DISP_END[c] = (unsigned int *) (offset + nlocal_procs * sizeof(unsigned int) * 7); } PRINTF("NB_CLUSTERS = %d\n", nclusters); PRINTF("NB_LOCAL_PROCS = %d\n", nlocal_procs); PRINTF("NB_GLOBAL_PROCS = %d\n", nglobal_procs); PRINTF("NB_PIXELS = %d\n", npixels); PRINTF("PIXEL_SIZE = %d\n", PIXEL_SIZE); PRINTF("FRAME_SIZE = %d\n", frame_size); PRINTF("BLOCK_SIZE = %d\n", block_size); PRINTF("NB_BLOCKS = %d\n\n", nblocks); PRINTF("*** Starting barrier init at cycle %d ***\n", proctime()); // barriers initialization barrier_init(0, nglobal_procs); barrier_init(1, nglobal_procs); barrier_init(2, nglobal_procs); barrier_init(3, nglobal_procs); PRINTF("*** Completing barrier init at cycle %d ***\n", proctime()); //////////////////////////////////////////////////////// // pseudo parallel load from disk to A[c] buffers // only task running on processor with (local_id==0) does it // nblocks/nclusters are loaded in each cluster //////////////////////////////////////////////////////// if (local_id == 0){ int p; date = proctime(); PRINTF("\n*** Starting load at cycle %d\n", date); for (p = 0; p < nlocal_procs; p++){ LOAD_START[cluster_id][p] = date; } if (ioc_read(nblocks*cluster_id/nclusters, A[cluster_id], nblocks/nclusters)){ PRINTF("echec ioc_read\n"); exit(1); } if (ioc_completed()){ PRINTF("echec ioc_completed\n"); exit(1); } date = proctime(); PRINTF("*** Completing load at cycle %d\n", date); for (p = 0; p < nlocal_procs; p++){ LOAD_END[cluster_id][p] = date; } } barrier_wait(0); //////////////////////////////////////////////////////// // parallel horizontal filter : // B <= transpose(FH(A)) // D <= A - FH(A) // Each task computes (NL/nglobal_procs) lines // The image must be extended : // if (z<0) TA(cluster_id,l,z) == TA(cluster_id,l,0) // if (z>NP-1) TA(cluster_id,l,z) == TA(cluster_id,l,NP-1) //////////////////////////////////////////////////////// date = proctime(); PRINTF("\n*** Starting horizontal filter at cycle %d\n", date); HORI_START[cluster_id][local_id] = date; // l = absolute line index / p = absolute pixel index // first & last define which lines are handled by a given task(cluster_id,local_id) first = (cluster_id * nlocal_procs + local_id) * lines_per_task; last = first + lines_per_task; for (l = first; l < last; l++){ // src_c and src_l are the cluster index and the line index for A & D int src_c = l / lines_per_cluster; int src_l = l % lines_per_cluster; // We use the specific values of the horizontal ep-filter for optimisation: // sum(p) = sum(p-1) + TA[p+hrange] - TA[p-hrange-1] // To minimize the number of tests, the loop on pixels is split in three domains int sum_p = (hrange + 2) * TA(src_c, src_l, 0); for (x = 1; x < hrange; x++){ sum_p = sum_p + TA(src_c, src_l, x); } // first domain : from 0 to hrange for (p = 0; p < hrange + 1; p++){ // dst_c and dst_p are the cluster index and the pixel index for B int dst_c = p / pixels_per_cluster; int dst_p = p % pixels_per_cluster; sum_p = sum_p + (int) TA(src_c, src_l, p + hrange) - (int) TA(src_c, src_l, 0); TB(dst_c, dst_p, l) = sum_p / hnorm; TD(src_c, src_l, p) = (int) TA(src_c, src_l, p) - sum_p / hnorm; } // second domain : from (hrange+1) to (NP-hrange-1) for (p = hrange + 1; p < NP - hrange; p++){ // dst_c and dst_p are the cluster index and the pixel index for B int dst_c = p / pixels_per_cluster; int dst_p = p % pixels_per_cluster; sum_p = sum_p + (int) TA(src_c, src_l, p + hrange) - (int) TA(src_c, src_l, p - hrange - 1); TB(dst_c, dst_p, l) = sum_p / hnorm; TD(src_c, src_l, p) = (int) TA(src_c, src_l, p) - sum_p / hnorm; } // third domain : from (NP-hrange) to (NP-1) for (p = NP - hrange; p < NP; p++){ // dst_c and dst_p are the cluster index and the pixel index for B int dst_c = p / pixels_per_cluster; int dst_p = p % pixels_per_cluster; sum_p = sum_p + (int) TA(src_c, src_l, NP - 1) - (int) TA(src_c, src_l, p - hrange - 1); TB(dst_c, dst_p, l) = sum_p / hnorm; TD(src_c, src_l, p) = (int) TA(src_c, src_l, p) - sum_p / hnorm; } PRINTF(" - line %d computed at cycle %d\n", l, proctime()); } date = proctime(); PRINTF("*** Completing horizontal filter at cycle %d\n", date); HORI_END[cluster_id][local_id] = date; barrier_wait(1); ////////////////////////////////////////////////////////// // parallel vertical filter : // C <= transpose(FV(B)) // Each task computes (NP/nglobal_procs) columns // The image must be extended : // if (l<0) TB(cluster_id,p,x) == TB(cluster_id,p,0) // if (l>NL-1) TB(cluster_id,p,x) == TB(cluster_id,p,NL-1) ////////////////////////////////////////////////////////// date = proctime(); PRINTF("\n*** starting vertical filter at cycle %d\n", date); VERT_START[cluster_id][local_id] = date; // l = absolute line index / p = absolute pixel index // first & last define which pixels are handled by a given task(cluster_id,local_id) first = (cluster_id * nlocal_procs + local_id) * pixels_per_task; last = first + pixels_per_task; for (p = first; p < last; p++){ // src_c and src_p are the cluster index and the pixel index for B int src_c = p / pixels_per_cluster; int src_p = p % pixels_per_cluster; int sum_l; // We use the specific values of the vertical ep-filter // To minimize the number of tests, the NL lines are split in three domains // first domain : explicit computation for the first 18 values for (l = 0; l < 18; l++){ // dst_c and dst_l are the cluster index and the line index for C int dst_c = l / lines_per_cluster; int dst_l = l % lines_per_cluster; for (x = 0, sum_l = 0; x < 35; x++){ sum_l = sum_l + vf[x] * TB(src_c, src_p, max(l - 17 + x,0) ); } TC(dst_c, dst_l, p) = sum_l / vnorm; } // second domain for (l = 18; l < NL - 17; l++){ // dst_c and dst_l are the cluster index and the line index for C int dst_c = l / lines_per_cluster; int dst_l = l % lines_per_cluster; sum_l = sum_l + TB(src_c, src_p, l + 4) + TB(src_c, src_p, l + 8) + TB(src_c, src_p, l + 11) + TB(src_c, src_p, l + 15) + TB(src_c, src_p, l + 17) - TB(src_c, src_p, l - 5) - TB(src_c, src_p, l - 9) - TB(src_c, src_p, l - 12) - TB(src_c, src_p, l - 16) - TB(src_c, src_p, l - 18); TC(dst_c, dst_l, p) = sum_l / vnorm; } // third domain for (l = NL - 17; l < NL; l++){ // dst_c and dst_l are the cluster index and the line index for C int dst_c = l / lines_per_cluster; int dst_l = l % lines_per_cluster; sum_l = sum_l + TB(src_c, src_p, min(l + 4, NL - 1)) + TB(src_c, src_p, min(l + 8, NL - 1)) + TB(src_c, src_p, min(l + 11, NL - 1)) + TB(src_c, src_p, min(l + 15, NL - 1)) + TB(src_c, src_p, min(l + 17, NL - 1)) - TB(src_c, src_p, l - 5) - TB(src_c, src_p, l - 9) - TB(src_c, src_p, l - 12) - TB(src_c, src_p, l - 16) - TB(src_c, src_p, l - 18); TC(dst_c, dst_l, p) = sum_l / vnorm; } PRINTF(" - column %d computed at cycle %d\n", p, proctime()); } date = proctime(); PRINTF("*** Completing vertical filter at cycle %d\n", date); VERT_END[cluster_id][local_id] = date; barrier_wait(2); //////////////////////////////////////////////////////////////// // final computation and parallel display // Z <= D + C // Each processor use its private DMA channel to display // the resulting image, line per line (one byte per pixel). // Eah processor computes & displays (NL/nglobal_procs) lines. //////////////////////////////////////////////////////////////// date = proctime(); PRINTF("\n*** Starting display at cycle %d\n", date); DISP_START[cluster_id][local_id] = date; first = local_id * lines_per_task; last = first + lines_per_task; for (l = first; l < last; l++){ for (p = 0; p < NP; p++){ TZ(cluster_id,l,p) = (unsigned char) (((TD(cluster_id,l,p) + TC(cluster_id,l,p)) >> 8) & 0xFF); } fb_write(NP * (cluster_id * lines_per_cluster + l), &TZ(cluster_id,l,0), NP); } date = proctime(); PRINTF("*** Completing display at cycle %d\n", date); DISP_END[cluster_id][local_id] = date; barrier_wait(3); ///////////////////////////////////////////////////////// // Instrumentation (done by processor 0 in cluster 0) ///////////////////////////////////////////////////////// if (proc_id == 0){ date = proctime(); PRINTF("\n*** Starting Instrumentation at cycle %d\n\n", date); int cc, pp; unsigned int min_load_start = INT_MAX; unsigned int max_load_start = 0; unsigned int min_load_ended = INT_MAX; unsigned int max_load_ended = 0; unsigned int min_hori_start = INT_MAX; unsigned int max_hori_start = 0; unsigned int min_hori_ended = INT_MAX; unsigned int max_hori_ended = 0; unsigned int min_vert_start = INT_MAX; unsigned int max_vert_start = 0; unsigned int min_vert_ended = INT_MAX; unsigned int max_vert_ended = 0; unsigned int min_disp_start = INT_MAX; unsigned int max_disp_start = 0; unsigned int min_disp_ended = INT_MAX; unsigned int max_disp_ended = 0; for (cc = 0; cc < nclusters; cc++){ for (pp = 0; pp < nlocal_procs; pp++ ){ if (LOAD_START[cc][pp] < min_load_start){ min_load_start = LOAD_START[cc][pp]; } if (LOAD_START[cc][pp] > max_load_start){ max_load_start = LOAD_START[cc][pp]; } if (LOAD_END[cc][pp] < min_load_ended){ min_load_ended = LOAD_END[cc][pp]; } if (LOAD_END[cc][pp] > max_load_ended){ max_load_ended = LOAD_END[cc][pp]; } if (HORI_START[cc][pp] < min_hori_start){ min_hori_start = HORI_START[cc][pp]; } if (HORI_START[cc][pp] > max_hori_start){ max_hori_start = HORI_START[cc][pp]; } if (HORI_END[cc][pp] < min_hori_ended){ min_hori_ended = HORI_END[cc][pp]; } if (HORI_END[cc][pp] > max_hori_ended){ max_hori_ended = HORI_END[cc][pp]; } if (VERT_START[cc][pp] < min_vert_start){ min_vert_start = VERT_START[cc][pp]; } if (VERT_START[cc][pp] > max_vert_start){ max_vert_start = VERT_START[cc][pp]; } if (VERT_END[cc][pp] < min_vert_ended){ min_vert_ended = VERT_END[cc][pp]; } if (VERT_END[cc][pp] > max_vert_ended){ max_vert_ended = VERT_END[cc][pp]; } if (DISP_START[cc][pp] < min_disp_start){ min_disp_start = DISP_START[cc][pp]; } if (DISP_START[cc][pp] > max_disp_start){ max_disp_start = DISP_START[cc][pp]; } if (DISP_END[cc][pp] < min_disp_ended){ min_disp_ended = DISP_END[cc][pp]; } if (DISP_END[cc][pp] > max_disp_ended){ max_disp_ended = DISP_END[cc][pp]; } } } PRINTF(" - LOAD_START : min = %d / max = %d / med = %d / delta = %d\n", min_load_start, max_load_start, (min_load_start+max_load_start) / 2, max_load_start-min_load_start); PRINTF(" - LOAD_END : min = %d / max = %d / med = %d / delta = %d\n", min_load_ended, max_load_ended, (min_load_ended+max_load_ended) / 2, max_load_ended-min_load_ended); PRINTF(" - HORI_START : min = %d / max = %d / med = %d / delta = %d\n", min_hori_start, max_hori_start, (min_hori_start+max_hori_start) / 2, max_hori_start-min_hori_start); PRINTF(" - HORI_END : min = %d / max = %d / med = %d / delta = %d\n", min_hori_ended, max_hori_ended, (min_hori_ended+max_hori_ended) / 2, max_hori_ended-min_hori_ended); PRINTF(" - VERT_START : min = %d / max = %d / med = %d / delta = %d\n", min_vert_start, max_vert_start, (min_vert_start+max_vert_start) / 2, max_vert_start-min_vert_start); PRINTF(" - VERT_END : min = %d / max = %d / med = %d / delta = %d\n", min_vert_ended, max_vert_ended, (min_vert_ended+max_vert_ended) / 2, max_vert_ended-min_vert_ended); PRINTF(" - DISP_START : min = %d / max = %d / med = %d / delta = %d\n", min_disp_start, max_disp_start, (min_disp_start+max_disp_start) / 2, max_disp_start-min_disp_start); PRINTF(" - DISP_END : min = %d / max = %d / med = %d / delta = %d\n", min_disp_ended, max_disp_ended, (min_disp_ended+max_disp_ended) / 2, max_disp_ended-min_disp_ended); PRINTF(" - BARRIER LOAD/HORI = %d\n", min_hori_start - max_load_ended); PRINTF(" - BARRIER HORI/VERT = %d\n", min_vert_start - max_hori_ended); PRINTF(" - BARRIER VERT/DISP = %d\n", min_disp_start - max_vert_ended); PRINTF(" - LOAD = %d\n", max_load_ended); PRINTF(" - FILTER = %d\n", max_vert_ended - max_load_ended); PRINTF(" - DISPLAY = %d\n", max_disp_ended - max_vert_ended); PRINTF("\nBEGIN LOAD_START\n"); for (cc = 0; cc < nclusters; cc++){ for (pp = 0; pp < nlocal_procs; pp++){ PRINTF("cluster= %d proc= %d date= %d\n", cc, pp, LOAD_START[cc][pp]); } } PRINTF("END\n"); PRINTF("\nBEGIN LOAD_END\n"); for (cc = 0; cc < nclusters; cc++){ for (pp = 0; pp < nlocal_procs; pp++){ PRINTF("cluster= %d proc= %d date= %d\n", cc, pp, LOAD_END[cc][pp]); } } PRINTF("END\n"); PRINTF("\nBEGIN HORI_START\n"); for (cc = 0; cc < nclusters; cc++){ for (pp = 0; pp < nlocal_procs; pp++){ PRINTF("cluster= %d proc= %d date= %d\n", cc, pp, HORI_START[cc][pp]); } } PRINTF("END\n"); PRINTF("\nBEGIN HORI_END\n"); for (cc = 0; cc < nclusters; cc++){ for (pp = 0; pp < nlocal_procs; pp++){ PRINTF("cluster= %d proc= %d date= %d\n", cc, pp, HORI_END[cc][pp]); } } PRINTF("END\n"); PRINTF("\nBEGIN VERT_START\n"); for (cc = 0; cc < nclusters; cc++){ for (pp = 0; pp < nlocal_procs; pp++){ PRINTF("cluster= %d proc= %d date= %d\n", cc, pp, VERT_START[cc][pp]); } } PRINTF("END\n"); PRINTF("\nBEGIN VERT_END\n"); for (cc = 0; cc < nclusters; cc++){ for (pp = 0; pp < nlocal_procs; pp++ ){ PRINTF("cluster= %d proc= %d date= %d\n", cc, pp, VERT_END[cc][pp]); } } PRINTF("END\n"); PRINTF("\nBEGIN DISP_START\n"); for (cc = 0; cc < nclusters; cc++){ for (pp = 0; pp < nlocal_procs; pp++){ PRINTF("cluster= %d proc= %d date= %d\n", cc, pp, DISP_START[cc][pp]); } } PRINTF("END\n"); PRINTF("\nBEGIN DISP_END\n"); for (cc = 0; cc < nclusters; cc++){ for (pp = 0; pp < nlocal_procs; pp++){ PRINTF("cluster= %d proc= %d date= %d\n", cc, pp, DISP_END[cc][pp]); } } PRINTF("END\n"); } while(1); } // end main() // Local Variables: // tab-width: 3 // c-basic-offset: 3 // c-file-offsets:((innamespace . 0)(inline-open . 0)) // indent-tabs-mode: nil // End: // vim: filetype=cpp:expandtab:shiftwidth=3:tabstop=3:softtabstop=3