Changeset 248


Ignore:
Timestamp:
Aug 9, 2012, 10:57:23 AM (12 years ago)
Author:
meunier
Message:

Updates to the soft_filter application (bug fixes, formatting, and adaptation to the generic_mmu architecture)

Location:
trunk/softs
Files:
8 edited

Legend:

Unmodified
Added
Removed
  • trunk/softs/giet_tsar/dma.h

    r158 r248  
    99    DMA_IRQ_DISABLE = 4,
    1010    /***/
    11     DMA_SPAN        = 8,
     11    DMA_SPAN        = 0x400,
    1212};
    1313
    1414enum DmaStatusValues {
    15     DMA_IDLE        = 0,
    16     DMA_SUCCESS     = 1,
    17     DMA_READ_ERROR  = 2,
     15    DMA_SUCCESS     = 0,
     16    DMA_READ_ERROR  = 1,
     17    DMA_IDLE        = 2,
    1818    DMA_WRITE_ERROR = 3,
    1919};
  • trunk/softs/giet_tsar/drivers.c

    r178 r248  
    376376
    377377    tty_address = (char*)(base + increment + tid*TTY_SPAN*4);
     378    //tty_address = (char*)(base + tid*TTY_SPAN*4);
    378379
    379380    for ( i=0 ; i < length ; i++ )
     
    569570    if( index >= max ) return -1;
    570571
    571     register int        delay = ( (_proctime() + _procid() ) & 0xF) << 4;
    572     register int*       plock = (int*)&_spin_lock[index];                       
     572    register int   delay = ((_proctime() +_procid()) & 0xF) << 4;
     573    register int * plock = (int *) &_spin_lock[index];                 
    573574
    574575    asm volatile ("_locks_llsc:                 \n"
     
    769770{
    770771    int*                dma_address;
    771     unsigned int        base            = (unsigned int)&seg_dma_base;
     772    unsigned int        base            = (unsigned int) &seg_dma_base;
    772773    unsigned int        increment       = _segment_increment(DMA_SPAN*4);
    773     char*               fb              = (char*)&seg_fb_base + offset;
    774     unsigned int        delay           = (_proctime() & 0xF) << 4;
     774    char *      fb              = (char *) &seg_fb_base + offset;
     775    unsigned int        delay = (_proctime() & 0xF) << 4;
    775776    unsigned int        pid             = _procid();
    776777    unsigned int        i;
  • trunk/softs/giet_tsar/isr.c

    r158 r248  
    183183   End: */
    184184
    185 /* vim: set filetype=asm expandtab shiftwidth=4 tabstop=4 softtabstop=4: */
     185/* vim: set filetype=c expandtab shiftwidth=4 tabstop=4 softtabstop=4: */
    186186
  • trunk/softs/giet_tsar/reset.s

    r163 r248  
    107107    nop
    108108    la      $26,    _interrupt_vector   # interrupt vector address
     109    la      $27,    _isr_dma
     110    sw      $27,    0($26)              # interrupt_vector[0] <= _isr_dma
     111    sw      $27,    4($26)              # interrupt_vector[1] <= _isr_dma
     112    sw      $27,    8($26)              # interrupt_vector[2] <= _isr_dma
     113    sw      $27,   12($26)              # interrupt_vector[3] <= _isr_dma
    109114    la      $27,    _isr_tty_get
    110     sw      $27,    0($26)              # interrupt_vector[0] <= _isr_tty_get
    111     sw      $27,    4($26)              # interrupt_vector[1] <= _isr_tty_get
    112     sw      $27,    8($26)              # interrupt_vector[2] <= _isr_tty_get
    113     sw      $27,   12($26)              # interrupt_vector[3] <= _isr_tty_get
    114     la      $27,    _isr_dma
    115     sw      $27,   16($26)              # interrupt_vector[4] <= _isr_dma
    116     sw      $27,   20($26)              # interrupt_vector[5] <= _isr_dma
    117     sw      $27,   24($26)              # interrupt_vector[6] <= _isr_dma
    118     sw      $27,   28($26)              # interrupt_vector[7] <= _isr_dma
     115    sw      $27,   16($26)              # interrupt_vector[4] <= _isr_tty_get
     116    sw      $27,   20($26)              # interrupt_vector[5] <= _isr_tty_get
     117    sw      $27,   24($26)              # interrupt_vector[6] <= _isr_tty_get
     118    sw      $27,   28($26)              # interrupt_vector[7] <= _isr_tty_get
    119119    la      $27,    _isr_ioc
    120120    sw      $27,   32($26)              # interrupt_vector[8] <= _isr_ioc
  • trunk/softs/soft_filter_giet/Makefile

    r163 r248  
    1 LD=mipsel-unknown-elf-ld
    2 CC=mipsel-unknown-elf-gcc
    3 AS=mipsel-unknown-elf-as
    4 DU=mipsel-unknown-elf-objdump
    51
    6 OBJS=   reset.o \
     2LD = mipsel-unknown-elf-ld
     3CC = mipsel-unknown-elf-gcc
     4AS = mipsel-unknown-elf-as
     5DU = mipsel-unknown-elf-objdump
     6
     7OBJS = reset.o \
    78        giet.o \
    89        isr.o \
     
    1112        main.o
    1213
    13 CFLAGS= -Wall -mno-gpopt -ffreestanding -fomit-frame-pointer -mips32 -ggdb
     14CFLAGS = -Wall -g -mno-gpopt -ffreestanding -fomit-frame-pointer -mips32 -ggdb
    1415
    15 GIET=   ../giet_tsar
     16GIET =  ../giet_tsar
    1617
    1718bin.soft: $(OBJS) ldscript
  • trunk/softs/soft_filter_giet/ldscript

    r174 r248  
    1010peripherals are not present in the architecture */
    1111
    12 NB_CLUSTERS             = 64;           /* number of clusters */
     12NB_CLUSTERS             = 4;            /* number of clusters */
    1313NB_PROCS                = 4;            /* number of processors per cluster */
    1414NB_TASKS                = 1;            /* number of tasks per processor */
     
    3232
    3333seg_icu_base    = 0x00F00000;       /* controleur ICU */
    34 seg_tty_base    = 0x00F10000;       /* controleur TTY */
    35 seg_dma_base    = 0x00F20000;       /* controleur DMA */
     34seg_tty_base    = 0xBFF20000;       /* controleur TTY */
     35seg_dma_base    = 0x00F30000;       /* controleur DMA */
    3636
    3737seg_reset_base  = 0xBFC00000;       /* le code de boot */
    3838seg_fb_base     = 0xBFD00000;       /* controleur FRAME BUFFER */
    39 seg_ioc_base    = 0xBFF30000;       /* controleur I/O */
     39seg_ioc_base    = 0xBFF10000;       /* controleur I/O */
    4040
    41 seg_timer_base  = 0xBFF40000;       /* controleur TIMER */
     41seg_timer_base  = 0x00F0000;       /* controleur TIMER */
    4242seg_gcd_base    = 0xBFF50000;       /* controleur GCD */
    4343
  • trunk/softs/soft_filter_giet/main.c

    r174 r248  
     1
     2#include "limits.h"
    13#include "stdio.h"
     4
     5#include "../giet_tsar/block_device.h"
    26
    37////////////////////////////////////
    48// Image parameters
    59
    6 #define PIXEL_SIZE      2
    7 #define NL              1024
    8 #define NP              1024
    9 #define BLOCK_SIZE      1024
    10 
    11 #define PRINTF          if(lid==0) tty_printf
    12 
    13 #define TA(c,l,p)  (A[c][((NP)*(l))+(p)])
    14 #define TB(c,p,l)  (B[c][((NL)*(p))+(l)])
    15 #define TC(c,l,p)  (C[c][((NP)*(l))+(p)])
    16 #define TD(c,l,p)  (D[c][((NP)*(l))+(p)])
    17 #define TZ(c,l,p)  (Z[c][((NP)*(l))+(p)])
     10#define NB_CLUSTER_MAX 256
     11#define PIXEL_SIZE     2
     12#define NL             1024
     13#define NP             1024
     14
     15#define NB_PIXELS      ((NP) * (NL))
     16#define FRAME_SIZE     ((NB_PIXELS) * (PIXEL_SIZE))
     17
     18#define PRINTF(...)      ({ if (proc_id == 1) { tty_printf(__VA_ARGS__); } })
     19
     20#define TA(c,l,p)  (A[c][((NP) * (l)) + (p)])
     21#define TB(c,p,l)  (B[c][((NL) * (p)) + (l)])
     22#define TC(c,l,p)  (C[c][((NP) * (l)) + (p)])
     23#define TD(c,l,p)  (D[c][((NP) * (l)) + (p)])
     24#define TZ(c,l,p)  (Z[c][((NP) * (l)) + (p)])
    1825
    1926#define max(x,y) ((x) > (y) ? (x) : (y))
     
    2633struct plaf;
    2734
     35extern struct plouf seg_ioc_base;
    2836extern struct plaf seg_heap_base;
    2937extern struct plaf NB_PROCS;
    3038extern struct plaf NB_CLUSTERS;
    3139
    32 /////////////
    33 void main()
    34 {
    35 
    36 //////////////////////////////////
    37 // convolution kernel parameters
    38 // The content of this section is
    39 // Philips proprietary information.
    40 ///////////////////////////////////
    41 
    42     int vnorm  = 115;
    43     int vf[35];
    44     vf[0]  = 1;
    45     vf[1]  = 1;
    46     vf[2]  = 2;
    47     vf[3]  = 2;
    48     vf[4]  = 2;
    49     vf[5]  = 2;
    50     vf[6]  = 3;
    51     vf[7]  = 3;
    52     vf[8]  = 3;
    53     vf[9]  = 4;
    54     vf[10] = 4;
    55     vf[11] = 4;
    56     vf[12] = 4;
    57     vf[13] = 5;
    58     vf[14] = 5;
    59     vf[15] = 5;
    60     vf[16] = 5;
    61     vf[17] = 5;
    62     vf[18] = 5;
    63     vf[19] = 5;
    64     vf[20] = 5;
    65     vf[21] = 5;
    66     vf[22] = 4;
    67     vf[23] = 4;
    68     vf[24] = 4;
    69     vf[25] = 4;
    70     vf[26] = 3;
    71     vf[27] = 3;
    72     vf[28] = 3;
    73     vf[29] = 2;
    74     vf[30] = 2;
    75     vf[31] = 2;
    76     vf[32] = 2;
    77     vf[33] = 1;
    78     vf[34] = 1;
    79 
    80     int hrange = 100;
    81     int hnorm  = 201;
    82 
    83     unsigned int date      = 0;
    84 
    85     int c;                                                      // cluster index for loops
    86     int l;                                                      // line index for loops
    87     int p;                                                      // pixel index for loops
    88     int x;                                                      // filter index for loops
    89 
    90     int pid                 = procid();                         // processor id
    91     int nprocs              = (int)&NB_PROCS;                   // number of processors per cluster
    92     int nclusters           = (int)&NB_CLUSTERS;                // number of clusters
    93     int lid                 = pid%nprocs;                       // local task id
    94     int cid                 = pid/nprocs;                       // cluster task id
    95     int base                = (unsigned int)&seg_heap_base;     // base address for shared buffers
    96     int increment           = (0x80000000 / nclusters) * 2;     // cluster increment
    97     int ntasks              = nclusters * nprocs;               // number of tasks
    98     int nblocks             = (NP*NL*PIXEL_SIZE)/BLOCK_SIZE;    // number of blocks per image
    99 
    100     int lines_per_task      = NL/ntasks;                        // number of lines per task
    101     int lines_per_cluster   = NL/nclusters;                     // number of lines per cluster
    102     int pixels_per_task     = NP/ntasks;                        // number of columns per task
    103     int pixels_per_cluster  = NP/nclusters;                     // number of columns per cluster
    104 
    105     int first, last;
    106 
    107     PRINTF("\n*** Processor %d entering main at cycle %d ***\n\n", pid, proctime());
    108    
    109     //////////////////////////
    110     //  parameters checking
    111     if( (nprocs != 1) && (nprocs != 2) && (nprocs != 4) )
    112     {
    113         PRINTF("NB_PROCS must be 1, 2 or 4\n");
    114         while(1);
    115     }
    116     if( (nclusters !=  4) && (nclusters !=  8) && (nclusters != 16) &&
    117         (nclusters != 32) && (nclusters != 64) && (nclusters !=128) && (nclusters != 256) )
    118     {
    119         PRINTF("NB_CLUSTERS must be a power of 2 between 4 and 256\n");
    120         while(1);
    121     }
    122     if( pid >= ntasks )
    123     {
    124         PRINTF("processor id %d larger than NB_CLUSTERS*NB_PROCS\n", pid);
    125         while(1);
    126     }
    127     if ( NL % nclusters != 0 )
    128     {
    129         PRINTF("NB_CLUSTERS must be a divider of NL");
    130         while(1);
    131     }
    132     if( NP % nclusters != 0 )
    133     {
    134         PRINTF("NB_CLUSTERS must be a divider of NP");
    135         while(1);
    136     }
    137 
    138     //////////////////////////////////////////////////////////////////
    139     // Arrays of pointers on the shared, distributed buffers 
    140     // containing the images (sized for the worst case : 256 clusters)
    141     unsigned short*     A[256];
    142     int*                B[256];
    143     int*                C[256];
    144     int*                D[256];
    145     unsigned char*      Z[256];
    146    
    147     // Arrays of pointers on the instrumentation arrays
    148     // These arrays are indexed by the cluster index (sized for the worst case : 256 clusters)
    149     // each pointer points on the base adress of an array of 4 (NPROCS max) unsigned int
    150     unsigned int*       LOAD_START[256];
    151     unsigned int*       LOAD_ENDED[256];
    152     unsigned int*       VERT_START[256];
    153     unsigned int*       VERT_ENDED[256];
    154     unsigned int*       HORI_START[256];
    155     unsigned int*       HORI_ENDED[256];
    156     unsigned int*       DISP_START[256];
    157     unsigned int*       DISP_ENDED[256];
    158 
    159     // The shared, distributed buffers addresses are computed
    160     // from the seg_heap_base value defined in the ldscript file
    161     // and from the cluster increment = 4Gbytes/nclusters.
    162     // These arrays of pointers are identical and
    163     // replicated in the stack of each task
    164     for( c=0 ; c<nclusters ; c++)
    165     {
    166         A[c] = (unsigned short*)        (base                           + increment*c);
    167         Z[c] = (unsigned char*)         (base + 2*NP*NL/nclusters       + increment*c);
    168         B[c] = (int*)                   (base + 4*NP*NL/nclusters       + increment*c);
    169         C[c] = (int*)                   (base + 8*NP*NL/nclusters       + increment*c);
    170         D[c] = (int*)                   (base + 12*NP*NL/nclusters      + increment*c);
    171 
    172         LOAD_START[c] = (unsigned int*) (base + 3*NL*NP/nclusters       + increment*c);
    173         LOAD_ENDED[c] = (unsigned int*) (base + 3*NL*NP/nclusters + 16  + increment*c);
    174         VERT_START[c] = (unsigned int*) (base + 3*NL*NP/nclusters + 32  + increment*c);
    175         VERT_ENDED[c] = (unsigned int*) (base + 3*NL*NP/nclusters + 48  + increment*c);
    176         HORI_START[c] = (unsigned int*) (base + 3*NL*NP/nclusters + 64  + increment*c);
    177         HORI_ENDED[c] = (unsigned int*) (base + 3*NL*NP/nclusters + 80  + increment*c);
    178         DISP_START[c] = (unsigned int*) (base + 3*NL*NP/nclusters + 96  + increment*c);
    179         DISP_ENDED[c] = (unsigned int*) (base + 3*NL*NP/nclusters + 112 + increment*c);
    180     }
    181 
    182     PRINTF("NCLUSTERS = %d\n", nclusters);
    183     PRINTF("NPROCS    = %d\n\n", nprocs);
    184 
    185     PRINTF("*** Starting barrier init at cycle %d ***\n", proctime());
    186 
    187     //  barriers initialization
    188     barrier_init(0, ntasks);
    189     barrier_init(1, ntasks);
    190     barrier_init(2, ntasks);
    191     barrier_init(3, ntasks);
    192 
    193     PRINTF("*** Completing barrier init at cycle %d ***\n", proctime());
    194 
    195     ////////////////////////////////////////////////////////
    196     // pseudo parallel load from disk to A[c] buffers
    197     // only task running on processor with (lid==0) does it
    198     // nblocks/nclusters are loaded in each cluster
    199 
    200     if ( lid == 0 )
    201     {
    202         int p;
    203         date  = proctime();
    204         PRINTF("\n*** Starting load at cycle %d\n", date);
    205         for ( p=0 ; p<nprocs ; p++ ) LOAD_START[cid][p] = date;
    206 
    207         if( ioc_read(nblocks*cid/nclusters,
    208                      A[cid] ,
    209                      nblocks/nclusters) )
    210         {
    211             PRINTF("echec ioc_read\n");
    212             while(1);
    213         }
    214         if ( ioc_completed() )
    215         {
    216             PRINTF("echec ioc_completed\n");
    217             while(1);
    218         }
    219 
    220         date  = proctime();
    221         PRINTF("*** Completing load at cycle %d\n", date);
    222         for ( p=0 ; p<nprocs ; p++ ) LOAD_ENDED[cid][p] = date;
    223     }
    224 
    225     barrier_wait(0);
    226 
    227     //////////////////////////////////////////////////////////
    228     // parallel horizontal filter :
    229     // B <= transpose(FH(A))
    230     // D <= A - FH(A)
    231     // Each task computes (NL/ntasks) lines
    232     // The image must be extended :
    233     // if (z<0)         TA(cid,l,z) == TA(cid,l,0)
    234     // if (z>NP-1)      TA(cid,l,z) == TA(cid,l,NP-1)
    235 
    236     date  = proctime();
    237     PRINTF("\n*** Starting horizontal filter at cycle %d\n", date);
    238     HORI_START[cid][lid] = date;
    239 
    240     // l = absolute line index / p = absolute pixel index 
    241     // first & last define which lines are handled by a given task(cid,lid)
    242 
    243     first = (cid*nprocs + lid)*lines_per_task;
    244     last  = first + lines_per_task;
    245 
    246     for ( l=first ; l<last ; l++)
    247     {
    248         // src_c and src_l are the cluster index and the line index for A & D
    249         int src_c = l/lines_per_cluster;
    250         int src_l = l%lines_per_cluster;
    251 
    252         // We use the spécific values of the horizontal ep-filter for optimisation:
    253         // sum(p) = sum(p-1) + TA[p+hrange] - TA[p-hrange-1]
    254         // To minimize the number of tests, the loop on pixels is split in three domains
    255 
    256         int sum_p = (hrange+2)*TA(src_c, src_l, 0);
    257         for ( x = 1 ; x < hrange ; x++) sum_p = sum_p + TA(src_c, src_l, x);
    258 
    259         // first domain : from 0 to hrange
    260         for ( p=0 ; p<hrange+1 ; p++)
    261         {
    262             // dst_c and dst_p are the cluster index and the pixel index for B
    263             int dst_c = p/pixels_per_cluster;
    264             int dst_p = p%pixels_per_cluster;
    265             sum_p = sum_p + (int)TA(src_c, src_l, p+hrange) - (int)TA(src_c, src_l, 0);
    266             TB(dst_c, dst_p, l) = sum_p/hnorm;
    267             TD(src_c, src_l, p) = (int)TA(src_c, src_l, p) - sum_p/hnorm;
    268         }
    269         // second domain : from (hrange+1) to (NP-hrange-1)
    270         for ( p = hrange+1 ; p < NP-hrange ; p++)
    271         {
    272             // dst_c and dst_p are the cluster index and the pixel index for B
    273             int dst_c = p/pixels_per_cluster;
    274             int dst_p = p%pixels_per_cluster;
    275             sum_p = sum_p + (int)TA(src_c, src_l, p+hrange) - (int)TA(src_c, src_l, p-hrange-1);
    276             TB(dst_c, dst_p, l) = sum_p/hnorm;
    277             TD(src_c, src_l, p) = (int)TA(src_c, src_l, p) - sum_p/hnorm;
    278         }
    279         // third domain : from (NP-hrange) to (NP-1)
    280         for ( p = NP-hrange ; p < NP ; p++)
    281         {
    282             // dst_c and dst_p are the cluster index and the pixel index for B
    283             int dst_c = p/pixels_per_cluster;
    284             int dst_p = p%pixels_per_cluster;
    285             sum_p = sum_p + (int)TA(src_c, src_l, NP-1) - (int)TA(src_c, src_l, p-hrange-1);
    286             TB(dst_c, dst_p, l) = sum_p/hnorm;
    287             TD(src_c, src_l, p) = (int)TA(src_c, src_l, p) - sum_p/hnorm;
    288         }
    289 
    290         PRINTF(" - line %d computed at cycle %d\n", l, proctime());
    291     }
    292 
    293     date  = proctime();
    294     PRINTF("*** Completing horizontal filter at cycle %d\n", date);
    295     HORI_ENDED[cid][lid] = date;
    296 
    297     barrier_wait(1);
    298 
    299     //////////////////////////////////////////////////////////
    300     // parallel vertical filter :
    301     // C <= transpose(FV(B))
    302     // Each task computes (NP/ntasks) columns
    303     // The image must be extended :
    304     // if (l<0)         TB(cid,p,x) == TB(cid,p,0)
    305     // if (l>NL-1)      TB(cid,p,x) == TB(cid,p,NL-1)
    306 
    307     date  = proctime();
    308     PRINTF("\n*** starting vertical filter at cycle %d\n", date);
    309     VERT_START[cid][lid] = date;
    310 
    311     // l = absolute line index / p = absolute pixel index
    312     // first & last define which pixels are handled by a given task(cid,lid)
    313 
    314     first = (cid*nprocs + lid)*pixels_per_task;
    315     last  = first + pixels_per_task;
    316 
    317     for ( p=first ; p<last ; p++)
    318     {
    319         // src_c and src_p are the cluster index and the pixel index for B
    320         int src_c = p/pixels_per_cluster;
    321         int src_p = p%pixels_per_cluster;
    322 
    323         int sum_l;
    324 
    325         // We use the specific values of the vertical ep-filter
    326         // To minimize the number of tests, the NL lines are split in three domains
    327 
    328         // first domain : explicit computation for the first 18 values
    329         for ( l=0 ; l<18 ; l++)
    330         {
    331             // dst_c and dst_l are the cluster index and the line index for C
    332             int dst_c = l/lines_per_cluster;
    333             int dst_l = l%lines_per_cluster;
    334 
    335             for ( x=0, sum_l=0 ; x<35 ; x++ )
    336             {
    337                 sum_l = sum_l + vf[x] * TB(src_c, src_p, max(l-17+x,0) );
    338             }
    339             TC(dst_c, dst_l, p) = sum_l/vnorm;
    340         }
    341         // second domain
    342         for ( l = 18 ; l < NL-17 ; l++ )
    343         {
    344             // dst_c and dst_l are the cluster index and the line index for C
    345             int dst_c = l/lines_per_cluster;
    346             int dst_l = l%lines_per_cluster;
    347 
    348             sum_l = sum_l + TB(src_c, src_p, l+4)
    349                           + TB(src_c, src_p, l+8)
    350                           + TB(src_c, src_p, l+11)
    351                           + TB(src_c, src_p, l+15)
    352                           + TB(src_c, src_p, l+17)
    353                           - TB(src_c, src_p, l-5)
    354                           - TB(src_c, src_p, l-9)
    355                           - TB(src_c, src_p, l-12)
    356                           - TB(src_c, src_p, l-16)
    357                           - TB(src_c, src_p, l-18);
    358             TC(dst_c, dst_l, p) = sum_l/vnorm;
    359         }
    360         // third domain
    361         for ( l = NL-17 ; l < NL ; l++ )
    362         {
    363             // dst_c and dst_l are the cluster index and the line index for C
    364             int dst_c = l/lines_per_cluster;
    365             int dst_l = l%lines_per_cluster;
    366 
    367             sum_l = sum_l + TB(src_c, src_p, min(l+4,NL-1))
    368                           + TB(src_c, src_p, min(l+8,NL-1))
    369                           + TB(src_c, src_p, min(l+11,NL-1))
    370                           + TB(src_c, src_p, min(l+15,NL-1))
    371                           + TB(src_c, src_p, min(l+17,NL-1))
    372                           - TB(src_c, src_p, l-5)
    373                           - TB(src_c, src_p, l-9)
    374                           - TB(src_c, src_p, l-12)
    375                           - TB(src_c, src_p, l-16)
    376                           - TB(src_c, src_p, l-18);
    377             TC(dst_c, dst_l, p) = sum_l/vnorm;
    378         }
    379         PRINTF(" - column %d computed at cycle %d\n", p, proctime());
    380     }
    381 
    382     date  = proctime();
    383     PRINTF("*** Completing vertical filter at cycle %d\n", date);
    384     VERT_ENDED[cid][lid] = date;
    385 
    386     barrier_wait(2);
    387 
    388     ////////////////////////////////////////////////////////////////
    389     // final computation and parallel display
    390     // Z <= D + C
    391     // Each processor use its private DMA channel to display
    392     // the resulting image, line  per line (one byte per pixel).
    393     // Eah processor computes & displays (NL/ntasks) lines.
    394 
    395     date  = proctime();
    396     PRINTF("\n*** Starting display at cycle %d\n", date);
    397     DISP_START[cid][lid] = date;
    398 
    399     first = lid*lines_per_task;
    400     last  = first + lines_per_task;
    401 
    402     for ( l=first ; l<last ; l++)
    403     {
    404         for ( p=0 ; p<NP ; p++)
    405         {
    406            TZ(cid,l,p) = (unsigned char)(((TD(cid,l,p) + TC(cid,l,p))>>8) & 0xFF);
    407         }
    408         fb_write(NP*(cid*lines_per_cluster+l), &TZ(cid,l,0), NP);
    409     }
    410 
    411     date  = proctime();
    412     PRINTF("*** Completing display at cycle %d\n", date);
    413     DISP_ENDED[cid][lid] = date;
    414 
    415     barrier_wait(3);
    416 
    417     /////////////////////////////////////////////////////////
    418     // Instrumentation (done by processor 0 in cluster 0)   
    419 
    420     if ( pid == 0 )
    421     {
    422         date  = proctime();
    423         PRINTF("\n*** Starting Instrumentation at cycle %d\n\n", date);
    424 
    425         int cc, pp;
    426         unsigned int min_load_start = 1000000000;
    427         unsigned int max_load_start = 0;
    428         unsigned int min_load_ended = 1000000000;
    429         unsigned int max_load_ended = 0;
    430 
    431         unsigned int min_hori_start = 1000000000;
    432         unsigned int max_hori_start = 0;
    433         unsigned int min_hori_ended = 1000000000;
    434         unsigned int max_hori_ended = 0;
    435 
    436         unsigned int min_vert_start = 1000000000;
    437         unsigned int max_vert_start = 0;
    438         unsigned int min_vert_ended = 1000000000;
    439         unsigned int max_vert_ended = 0;
    440 
    441         unsigned int min_disp_start = 1000000000;
    442         unsigned int max_disp_start = 0;
    443         unsigned int min_disp_ended = 1000000000;
    444         unsigned int max_disp_ended = 0;
    445 
    446         for ( cc=0 ; cc<nclusters ; cc++ )
    447         {
    448             for ( pp=0 ; pp<nprocs ; pp++ )
    449             {
    450                 if ( LOAD_START[cc][pp] < min_load_start ) min_load_start = LOAD_START[cc][pp];
    451                 if ( LOAD_START[cc][pp] > max_load_start ) max_load_start = LOAD_START[cc][pp];
    452                 if ( LOAD_ENDED[cc][pp] < min_load_ended ) min_load_ended = LOAD_ENDED[cc][pp];
    453                 if ( LOAD_ENDED[cc][pp] > max_load_ended ) max_load_ended = LOAD_ENDED[cc][pp];
    454 
    455                 if ( HORI_START[cc][pp] < min_hori_start ) min_hori_start = HORI_START[cc][pp];
    456                 if ( HORI_START[cc][pp] > max_hori_start ) max_hori_start = HORI_START[cc][pp];
    457                 if ( HORI_ENDED[cc][pp] < min_hori_ended ) min_hori_ended = HORI_ENDED[cc][pp];
    458                 if ( HORI_ENDED[cc][pp] > max_hori_ended ) max_hori_ended = HORI_ENDED[cc][pp];
    459 
    460                 if ( VERT_START[cc][pp] < min_vert_start ) min_vert_start = VERT_START[cc][pp];
    461                 if ( VERT_START[cc][pp] > max_vert_start ) max_vert_start = VERT_START[cc][pp];
    462                 if ( VERT_ENDED[cc][pp] < min_vert_ended ) min_vert_ended = VERT_ENDED[cc][pp];
    463                 if ( VERT_ENDED[cc][pp] > max_vert_ended ) max_vert_ended = VERT_ENDED[cc][pp];
    464 
    465                 if ( DISP_START[cc][pp] < min_disp_start ) min_disp_start = DISP_START[cc][pp];
    466                 if ( DISP_START[cc][pp] > max_disp_start ) max_disp_start = DISP_START[cc][pp];
    467                 if ( DISP_ENDED[cc][pp] < min_disp_ended ) min_disp_ended = DISP_ENDED[cc][pp];
    468                 if ( DISP_ENDED[cc][pp] > max_disp_ended ) max_disp_ended = DISP_ENDED[cc][pp];
    469             }
    470         }
    471         PRINTF(" - LOAD_START : min = %d / max = %d / med = %d / delta = %d\n",
    472         min_load_start, max_load_start, (min_load_start+max_load_start)/2, max_load_start-min_load_start);
    473         PRINTF(" - LOAD_END   : min = %d / max = %d / med = %d / delta = %d\n",
    474         min_load_ended, max_load_ended, (min_load_ended+max_load_ended)/2, max_load_ended-min_load_ended);
    475 
    476         PRINTF(" - HORI_START : min = %d / max = %d / med = %d / delta = %d\n",
    477         min_hori_start, max_hori_start, (min_hori_start+max_hori_start)/2, max_hori_start-min_hori_start);
    478         PRINTF(" - HORI_END   : min = %d / max = %d / med = %d / delta = %d\n",
    479         min_hori_ended, max_hori_ended, (min_hori_ended+max_hori_ended)/2, max_hori_ended-min_hori_ended);
    480 
    481         PRINTF(" - VERT_START : min = %d / max = %d / med = %d / delta = %d\n",
    482         min_vert_start, max_vert_start, (min_vert_start+max_vert_start)/2, max_vert_start-min_vert_start);
    483         PRINTF(" - VERT_END   : min = %d / max = %d / med = %d / delta = %d\n",
    484         min_vert_ended, max_vert_ended, (min_vert_ended+max_vert_ended)/2, max_vert_ended-min_vert_ended);
    485 
    486         PRINTF(" - DISP_START : min = %d / max = %d / med = %d / delta = %d\n",
    487         min_disp_start, max_disp_start, (min_disp_start+max_disp_start)/2, max_disp_start-min_disp_start);
    488         PRINTF(" - DISP_END   : min = %d / max = %d / med = %d / delta = %d\n",
    489         min_disp_ended, max_disp_ended, (min_disp_ended+max_disp_ended)/2, max_disp_ended-min_disp_ended);
    490 
    491         PRINTF(" - BARRIER LOAD/HORI = %d\n", min_hori_start - max_load_ended);
    492         PRINTF(" - BARRIER HORI/VERT = %d\n", min_vert_start - max_hori_ended);
    493         PRINTF(" - BARRIER VERT/DISP = %d\n", min_disp_start - max_vert_ended);
    494 
    495         PRINTF(" - LOAD              = %d\n", max_load_ended);
    496         PRINTF(" - FILTER            = %d\n", max_vert_ended - max_load_ended);
    497         PRINTF(" - DISPLAY           = %d\n", max_disp_ended - max_vert_ended);
    498 
    499         PRINTF("\nBEGIN LOAD_START\n");
    500         for ( cc=0 ; cc<nclusters ; cc++ )
    501         {
    502             for ( pp=0 ; pp<nprocs ; pp++ )
    503             {
    504                 PRINTF("cluster= %d proc= %d date= %d\n", cc, pp, LOAD_START[cc][pp]); 
    505             }
    506         }
    507         PRINTF("END\n");
    508         PRINTF("\nBEGIN LOAD_ENDED\n");
    509         for ( cc=0 ; cc<nclusters ; cc++ )
    510         {
    511             for ( pp=0 ; pp<nprocs ; pp++ )
    512             {
    513                 PRINTF("cluster= %d proc= %d date= %d\n", cc, pp, LOAD_ENDED[cc][pp]); 
    514             }
    515         }
    516         PRINTF("END\n");
    517         PRINTF("\nBEGIN HORI_START\n");
    518         for ( cc=0 ; cc<nclusters ; cc++ )
    519         {
    520             for ( pp=0 ; pp<nprocs ; pp++ )
    521             {
    522                 PRINTF("cluster= %d proc= %d date= %d\n", cc, pp, HORI_START[cc][pp]); 
    523             }
    524         }
    525         PRINTF("END\n");
    526         PRINTF("\nBEGIN HORI_ENDED\n");
    527         for ( cc=0 ; cc<nclusters ; cc++ )
    528         {
    529             for ( pp=0 ; pp<nprocs ; pp++ )
    530             {
    531                 PRINTF("cluster= %d proc= %d date= %d\n", cc, pp, HORI_ENDED[cc][pp]); 
    532             }
    533         }
    534         PRINTF("END\n");
    535         PRINTF("\nBEGIN VERT_START\n");
    536         for ( cc=0 ; cc<nclusters ; cc++ )
    537         {
    538             for ( pp=0 ; pp<nprocs ; pp++ )
    539             {
    540                 PRINTF("cluster= %d proc= %d date= %d\n", cc, pp, VERT_START[cc][pp]); 
    541             }
    542         }
    543         PRINTF("END\n");
    544         PRINTF("\nBEGIN VERT_ENDED\n");
    545         for ( cc=0 ; cc<nclusters ; cc++ )
    546         {
    547             for ( pp=0 ; pp<nprocs ; pp++ )
    548             {
    549                 PRINTF("cluster= %d proc= %d date= %d\n", cc, pp, VERT_ENDED[cc][pp]); 
    550             }
    551         }
    552         PRINTF("END\n");
    553         PRINTF("\nBEGIN DISP_START\n");
    554         for ( cc=0 ; cc<nclusters ; cc++ )
    555         {
    556             for ( pp=0 ; pp<nprocs ; pp++ )
    557             {
    558                 PRINTF("cluster= %d proc= %d date= %d\n", cc, pp, DISP_START[cc][pp]); 
    559             }
    560         }
    561         PRINTF("END\n");
    562         PRINTF("\nBEGIN DISP_ENDED\n");
    563         for ( cc=0 ; cc<nclusters ; cc++ )
    564         {
    565             for ( pp=0 ; pp<nprocs ; pp++ )
    566             {
    567                 PRINTF("cluster= %d proc= %d date= %d\n", cc, pp, DISP_ENDED[cc][pp]); 
    568             }
    569         }
    570         PRINTF("END\n");
    571     }
    572 
    573     while(1);
     40
     41// Required when initializing an array all at once
     42static void *memcpy(void *_dst, const void *_src, unsigned int size){
     43   unsigned int *dst = _dst;
     44   const unsigned int *src = _src;
     45   if (! ((unsigned int)dst & 3) && ! ((unsigned int)src & 3)){
     46      while (size > 3){
     47         *dst++ = *src++;
     48         size -= 4;
     49      }
     50   }
     51
     52   unsigned char *cdst = (unsigned char*)dst;
     53   unsigned char *csrc = (unsigned char*)src;
     54
     55   while (size--){
     56      *cdst++ = *csrc++;
     57   }
     58   return _dst;
     59}
     60
     61
     62
     63
     64
     65
     66
     67
     // Entry point of the filter application. Each processor reads its own
     // identity through procid() (presumably every processor runs this same
     // function -- TODO confirm against the boot code).
     // Phases, separated by barriers 0..3:
     //   1. parameter checking and computation of the per-cluster buffer pointers
     //   2. load of the input frame from disk into A[] (tasks with local_id == 0)
     //   3. horizontal filter : B <= transpose(FH(A)), D <= A - FH(A)
     //   4. vertical filter   : C <= transpose(FV(B))
     //   5. Z <= D + C, displayed line by line with fb_sync_write()
     //   6. timing report printed by processor 0
     // The function never returns: it ends in an infinite loop.
     68void main(){
     69
     70   //////////////////////////////////
     71   // convolution kernel parameters
     72   // The content of this section is
     73   // Philips proprietary information.
     74   ///////////////////////////////////
     75
     76   int   vnorm  = 115;
     77   int   vf[35] = { 1, 1, 2, 2, 2,
     78                    2, 3, 3, 3, 4,
     79                    4, 4, 4, 5, 5,
     80                    5, 5, 5, 5, 5,
     81                    5, 5, 4, 4, 4,
     82                    4, 3, 3, 3, 2,
     83                    2, 2, 2, 1, 1 };
     84
     85   int hrange = 100;
     86   int hnorm  = 201;
     87
     88   unsigned int date = 0;
     89
     90   int c; // cluster index for loops
     91   int l; // line index for loops
     92   int p; // pixel index for loops
     93   int x; // filter index for loops
     94
     95   const unsigned int proc_id       = procid();                      // processor id
     96   const unsigned int nlocal_procs  = (int) &NB_PROCS;               // number of processors per cluster
     97   const unsigned int nclusters     = (int) &NB_CLUSTERS;            // number of clusters
     98   const unsigned int local_id      = proc_id % nlocal_procs;        // local task id
     99   const unsigned int cluster_id    = proc_id / nlocal_procs;        // cluster task id
     100   const unsigned int base          = (unsigned int) &seg_heap_base; // base address for shared buffers
     101   const unsigned int increment     = 0x80000000 / nclusters * 2;    // cluster increment
     102   const unsigned int nglobal_procs = nclusters * nlocal_procs;      // number of tasks
     103   const unsigned int npixels       = NB_PIXELS;                     // Number of pixel per frame
     104   const unsigned int frame_size    = FRAME_SIZE;                    // Size of 1 frame (in bytes)
     105   const unsigned int * ioc_address = (unsigned int *) &seg_ioc_base;
           // NOTE(review): block_size is read from the device and used as a divisor
           // on the next line without being checked against 0 -- confirm it cannot be 0.
     106   const unsigned int block_size    = ioc_address[BLOCK_DEVICE_BLOCK_SIZE];
     107   const unsigned int nblocks       = frame_size / block_size;       // number of blocks per frame
     108
     109   const unsigned int lines_per_task     = NL / nglobal_procs; // number of lines per task
     110   const unsigned int lines_per_cluster  = NL / nclusters;     // number of lines per cluster
     111   const unsigned int pixels_per_task    = NP / nglobal_procs; // number of columns per task
     112   const unsigned int pixels_per_cluster = NP / nclusters;     // number of columns per cluster
     113
     114   int first, last;
     115
     116   PRINTF("\n*** Processor %d entering main at cycle %d ***\n\n", proc_id, proctime());
     117
     118   //*(unsigned int *) 0x60000000 = *(unsigned int *) 0x70000000;
     119   //PRINTF("apres acces illegal\n");
     120
     121   /////////////////////////
     122   // parameters checking //
     123   /////////////////////////
     124   
     125
           // NOTE(review): exit() is called without an argument in this section but
           // as exit(1) in the load phase below -- confirm the intended prototype.
     126   if ((nlocal_procs != 1) && (nlocal_procs != 2) && (nlocal_procs != 4)){
     127      PRINTF("NB_PROCS must be 1, 2 or 4\n");
     128      exit();
     129   }
     130
     131   if ((nclusters !=  4) && (nclusters !=  8) && (nclusters != 16) &&
     132         (nclusters != 32) && (nclusters != 64) && (nclusters !=128) && (nclusters != 256)){
     133      PRINTF("NB_CLUSTERS must be a power of 2 between 4 and 256\n");
     134      exit();
     135   }
     136
     137   if (proc_id >= nglobal_procs){
     138      PRINTF("processor id %d larger than NB_CLUSTERS*NB_PROCS\n", proc_id);
     139      exit();
     140   }
     141
           // NOTE(review): only divisibility by nclusters is checked here, while
           // lines_per_task and pixels_per_task divide by nglobal_procs -- confirm
           // that NL and NP are also multiples of nclusters * nlocal_procs.
     142   if (NL % nclusters != 0){
     143      PRINTF("NB_CLUSTERS must be a divider of NL");
     144      exit();
     145   }
     146
     147   if (NP % nclusters != 0){
     148      PRINTF("NB_CLUSTERS must be a divider of NP");
     149      exit();
     150   }
     151
     152
     153   // Arrays of pointers on the shared, distributed buffers 
     154   // containing the images (sized for the worst case : 256 clusters)
     155   unsigned short * A[NB_CLUSTER_MAX];
     156   int *            B[NB_CLUSTER_MAX];
     157   int *            C[NB_CLUSTER_MAX];
     158   int *            D[NB_CLUSTER_MAX];
     159   unsigned char *  Z[NB_CLUSTER_MAX];
     160
     161   // Arrays of pointers on the instrumentation arrays
     162   // These arrays are indexed by the cluster index (sized for the worst case : 256 clusters)
     163   // each pointer points on the base address of an array of 4 (NPROCS max) unsigned int
     164   unsigned int * LOAD_START[NB_CLUSTER_MAX];
     165   unsigned int * LOAD_END[NB_CLUSTER_MAX];
     166   unsigned int * VERT_START[NB_CLUSTER_MAX];
     167   unsigned int * VERT_END[NB_CLUSTER_MAX];
     168   unsigned int * HORI_START[NB_CLUSTER_MAX];
     169   unsigned int * HORI_END[NB_CLUSTER_MAX];
     170   unsigned int * DISP_START[NB_CLUSTER_MAX];
     171   unsigned int * DISP_END[NB_CLUSTER_MAX];
     172
     173   // The shared, distributed buffers addresses are computed
     174   // from the seg_heap_base value defined in the ldscript file
     175   // and from the cluster increment = 4Gbytes/nclusters.
     176   // These arrays of pointers are identical and
     177   // replicated in the stack of each task
     178   for (c = 0; c < nclusters; c++){
     179      unsigned int offset = base + increment * c;
     180      A[c] = (unsigned short *) (offset                             );
     181      B[c] = (int *)            (offset + frame_size * 1 / nclusters); // We increment by 2 * frame_size
     182      C[c] = (int *)            (offset + frame_size * 3 / nclusters); // because sizeof(int) = 2*sizeof(short)
     183      D[c] = (int *)            (offset + frame_size * 5 / nclusters); // so an array of frame_size elements of type
     184      Z[c] = (unsigned char *)  (offset + frame_size * 7 / nclusters); // int can contain the equivalent of 2 frames
     185
              // The eight instrumentation arrays follow the image buffers in each
              // cluster, each one holding nlocal_procs unsigned int entries
     186      offset = base + increment * c + frame_size * 8 / nclusters;
     187      LOAD_START[c] = (unsigned int *) (offset + nlocal_procs * sizeof(unsigned int) * 0);
     188      LOAD_END[c]   = (unsigned int *) (offset + nlocal_procs * sizeof(unsigned int) * 1);
     189      VERT_START[c] = (unsigned int *) (offset + nlocal_procs * sizeof(unsigned int) * 2);
     190      VERT_END[c]   = (unsigned int *) (offset + nlocal_procs * sizeof(unsigned int) * 3);
     191      HORI_START[c] = (unsigned int *) (offset + nlocal_procs * sizeof(unsigned int) * 4);
     192      HORI_END[c]   = (unsigned int *) (offset + nlocal_procs * sizeof(unsigned int) * 5);
     193      DISP_START[c] = (unsigned int *) (offset + nlocal_procs * sizeof(unsigned int) * 6);
     194      DISP_END[c]   = (unsigned int *) (offset + nlocal_procs * sizeof(unsigned int) * 7);
     195   }
     196
     197   PRINTF("NB_CLUSTERS     = %d\n", nclusters);
     198   PRINTF("NB_LOCAL_PROCS  = %d\n", nlocal_procs);
     199   PRINTF("NB_GLOBAL_PROCS = %d\n", nglobal_procs);
     200   PRINTF("NB_PIXELS       = %d\n", npixels);
     201   PRINTF("PIXEL_SIZE      = %d\n", PIXEL_SIZE);
     202   PRINTF("FRAME_SIZE      = %d\n", frame_size);
     203   PRINTF("BLOCK_SIZE      = %d\n", block_size);
     204   PRINTF("NB_BLOCKS       = %d\n\n", nblocks);
     205
     206
     207   PRINTF("*** Starting barrier init at cycle %d ***\n", proctime());
     208
     209   //  barriers initialization
     210   barrier_init(0, nglobal_procs);
     211   barrier_init(1, nglobal_procs);
     212   barrier_init(2, nglobal_procs);
     213   barrier_init(3, nglobal_procs);
     214
     215   PRINTF("*** Completing barrier init at cycle %d ***\n", proctime());
     216
     217
     218   ////////////////////////////////////////////////////////
     219   // pseudo parallel load from disk to A[c] buffers
     220   // only task running on processor with (local_id==0) does it
     221   // nblocks/nclusters blocks are loaded in each cluster
     222   ////////////////////////////////////////////////////////
     223
     224   if (local_id == 0){
     225      int p;
     226      date  = proctime();
     227      PRINTF("\n*** Starting load at cycle %d\n", date);
              // The loading task records the same date for every processor of its cluster
     228      for (p = 0; p < nlocal_procs; p++){
     229         LOAD_START[cluster_id][p] = date;
     230      }
     231
     232      if (ioc_read(nblocks*cluster_id/nclusters, A[cluster_id], nblocks/nclusters)){
     233         PRINTF("echec ioc_read\n");
     234         exit(1);
     235      }
     236      if (ioc_completed()){
     237         PRINTF("echec ioc_completed\n");
     238         exit(1);
     239      }
     240
     241      date  = proctime();
     242      PRINTF("*** Completing load at cycle %d\n", date);
     243      for (p = 0; p < nlocal_procs; p++){
     244         LOAD_END[cluster_id][p] = date;
     245      }
     246   }
     247
     248   barrier_wait(0);
     249
     250
     251   ////////////////////////////////////////////////////////
     252   // parallel horizontal filter :
     253   // B <= transpose(FH(A))
     254   // D <= A - FH(A)
     255   // Each task computes (NL/nglobal_procs) lines
     256   // The image must be extended :
     257   // if (z<0)    TA(cluster_id,l,z) == TA(cluster_id,l,0)
     258   // if (z>NP-1) TA(cluster_id,l,z) == TA(cluster_id,l,NP-1)
     259   ////////////////////////////////////////////////////////
     260
     261   date  = proctime();
     262   PRINTF("\n*** Starting horizontal filter at cycle %d\n", date);
     263   HORI_START[cluster_id][local_id] = date;
     264
     265   // l = absolute line index / p = absolute pixel index 
     266   // first & last define which lines are handled by a given task(cluster_id,local_id)
     267
     268   first = (cluster_id * nlocal_procs + local_id) * lines_per_task;
     269   last  = first + lines_per_task;
     270
     271   for (l = first; l < last; l++){
     272      // src_c and src_l are the cluster index and the line index for A & D
     273      int src_c = l / lines_per_cluster;
     274      int src_l = l % lines_per_cluster;
     275
     276      // We use the specific values of the horizontal ep-filter for optimisation:
     277      // sum(p) = sum(p-1) + TA[p+hrange] - TA[p-hrange-1]
     278      // To minimize the number of tests, the loop on pixels is split in three domains
     279
              // Initial value of the sliding sum, before the first update at p = 0
     280      int sum_p = (hrange + 2) * TA(src_c, src_l, 0);
     281      for (x = 1; x < hrange; x++){
     282         sum_p = sum_p + TA(src_c, src_l, x);
     283      }
     284
     285      // first domain : from 0 to hrange
              // (the pixel leaving the window is outside the image on the left,
              //  so TA(...,0) is subtracted instead)
     286      for (p = 0; p < hrange + 1; p++){
     287         // dst_c and dst_p are the cluster index and the pixel index for B
     288         int dst_c = p / pixels_per_cluster;
     289         int dst_p = p % pixels_per_cluster;
     290         sum_p = sum_p + (int) TA(src_c, src_l, p + hrange) - (int) TA(src_c, src_l, 0);
     291         TB(dst_c, dst_p, l) = sum_p / hnorm;
     292         TD(src_c, src_l, p) = (int) TA(src_c, src_l, p) - sum_p / hnorm;
     293      }
     294      // second domain : from (hrange+1) to (NP-hrange-1)
     295      for (p = hrange + 1; p < NP - hrange; p++){
     296         // dst_c and dst_p are the cluster index and the pixel index for B
     297         int dst_c = p / pixels_per_cluster;
     298         int dst_p = p % pixels_per_cluster;
     299         sum_p = sum_p + (int) TA(src_c, src_l, p + hrange) - (int) TA(src_c, src_l, p - hrange - 1);
     300         TB(dst_c, dst_p, l) = sum_p / hnorm;
     301         TD(src_c, src_l, p) = (int) TA(src_c, src_l, p) - sum_p / hnorm;
     302      }
     303      // third domain : from (NP-hrange) to (NP-1)
              // (the pixel entering the window is outside the image on the right,
              //  so TA(...,NP-1) is added instead)
     304      for (p = NP - hrange; p < NP; p++){
     305         // dst_c and dst_p are the cluster index and the pixel index for B
     306         int dst_c = p / pixels_per_cluster;
     307         int dst_p = p % pixels_per_cluster;
     308         sum_p = sum_p + (int) TA(src_c, src_l, NP - 1) - (int) TA(src_c, src_l, p - hrange - 1);
     309         TB(dst_c, dst_p, l) = sum_p / hnorm;
     310         TD(src_c, src_l, p) = (int) TA(src_c, src_l, p) - sum_p / hnorm;
     311      }
     312
     313      PRINTF(" - line %d computed at cycle %d\n", l, proctime());
     314   }
     315
     316   date  = proctime();
     317   PRINTF("*** Completing horizontal filter at cycle %d\n", date);
     318   HORI_END[cluster_id][local_id] = date;
     319
     320   barrier_wait(1);
     321
     322
     323   //////////////////////////////////////////////////////////
     324   // parallel vertical filter :
     325   // C <= transpose(FV(B))
     326   // Each task computes (NP/nglobal_procs) columns
     327   // The image must be extended :
     328   // if (l<0)    TB(cluster_id,p,x) == TB(cluster_id,p,0)
     329   // if (l>NL-1)   TB(cluster_id,p,x) == TB(cluster_id,p,NL-1)
     330   //////////////////////////////////////////////////////////
     331
     332   date  = proctime();
     333   PRINTF("\n*** starting vertical filter at cycle %d\n", date);
     334   VERT_START[cluster_id][local_id] = date;
     335
     336   // l = absolute line index / p = absolute pixel index
     337   // first & last define which pixels are handled by a given task(cluster_id,local_id)
     338
     339   first = (cluster_id * nlocal_procs + local_id) * pixels_per_task;
     340   last  = first + pixels_per_task;
     341
     342   for (p = first; p < last; p++){
     343      // src_c and src_p are the cluster index and the pixel index for B
     344      int src_c = p / pixels_per_cluster;
     345      int src_p = p % pixels_per_cluster;
     346
              // sum_l is first assigned in the first domain below and is then
              // carried over into the second and third domains
     347      int sum_l;
     348
     349      // We use the specific values of the vertical ep-filter
     350      // To minimize the number of tests, the NL lines are split in three domains
     351
     352      // first domain : explicit computation for the first 18 values
     353      for (l = 0; l < 18; l++){
     354         // dst_c and dst_l are the cluster index and the line index for C
     355         int dst_c = l / lines_per_cluster;
     356         int dst_l = l % lines_per_cluster;
     357
     358         for (x = 0, sum_l = 0; x < 35; x++){
     359            sum_l = sum_l + vf[x] * TB(src_c, src_p, max(l - 17 + x,0) );
     360         }
     361         TC(dst_c, dst_l, p) = sum_l / vnorm;
     362      }
     363      // second domain
              // incremental update of sum_l from the previous line
              // (presumably the +/- offsets are the positions where the vf[]
              //  coefficients change by one -- TODO confirm)
     364      for (l = 18; l < NL - 17; l++){
     365         // dst_c and dst_l are the cluster index and the line index for C
     366         int dst_c = l / lines_per_cluster;
     367         int dst_l = l % lines_per_cluster;
     368
     369         sum_l = sum_l + TB(src_c, src_p, l + 4)
     370            + TB(src_c, src_p, l + 8)
     371            + TB(src_c, src_p, l + 11)
     372            + TB(src_c, src_p, l + 15)
     373            + TB(src_c, src_p, l + 17)
     374            - TB(src_c, src_p, l - 5)
     375            - TB(src_c, src_p, l - 9)
     376            - TB(src_c, src_p, l - 12)
     377            - TB(src_c, src_p, l - 16)
     378            - TB(src_c, src_p, l - 18);
     379         TC(dst_c, dst_l, p) = sum_l / vnorm;
     380      }
     381      // third domain
              // same incremental update, with the added line indexes clamped to NL-1
     382      for (l = NL - 17; l < NL; l++){
     383         // dst_c and dst_l are the cluster index and the line index for C
     384         int dst_c = l / lines_per_cluster;
     385         int dst_l = l % lines_per_cluster;
     386
     387         sum_l = sum_l + TB(src_c, src_p, min(l + 4, NL - 1))
     388            + TB(src_c, src_p, min(l + 8, NL - 1))
     389            + TB(src_c, src_p, min(l + 11, NL - 1))
     390            + TB(src_c, src_p, min(l + 15, NL - 1))
     391            + TB(src_c, src_p, min(l + 17, NL - 1))
     392            - TB(src_c, src_p, l - 5)
     393            - TB(src_c, src_p, l - 9)
     394            - TB(src_c, src_p, l - 12)
     395            - TB(src_c, src_p, l - 16)
     396            - TB(src_c, src_p, l - 18);
     397         TC(dst_c, dst_l, p) = sum_l / vnorm;
     398      }
     399      PRINTF(" - column %d computed at cycle %d\n", p, proctime());
     400   }
     401
     402   date  = proctime();
     403   PRINTF("*** Completing vertical filter at cycle %d\n", date);
     404   VERT_END[cluster_id][local_id] = date;
     405
     406   barrier_wait(2);
     407
     408
     409   ////////////////////////////////////////////////////////////////
     410   // final computation and parallel display
     411   // Z <= D + C
     412   // Each processor uses its private DMA channel to display
     413   // the resulting image, line per line (one byte per pixel).
     414   // Each processor computes & displays (NL/nglobal_procs) lines.
     415   ////////////////////////////////////////////////////////////////
     416
     417   date  = proctime();
     418   PRINTF("\n*** Starting display at cycle %d\n", date);
     419   DISP_START[cluster_id][local_id] = date;
     420
           // Here first & last are line indexes local to the cluster (unlike the
           // absolute indexes used by the two filters above)
     421   first = local_id * lines_per_task;
     422   last  = first + lines_per_task;
     423
     424   for (l = first; l < last; l++){
     425      for (p = 0; p < NP; p++){
     426         TZ(cluster_id,l,p) = (unsigned char) (((TD(cluster_id,l,p) + TC(cluster_id,l,p)) >> 8) & 0xFF);
     427      }
              // NOTE(review): the return value of fb_sync_write() is ignored -- confirm this is intended
     428      fb_sync_write(NP * (cluster_id * lines_per_cluster + l), &TZ(cluster_id,l,0), NP);
     429   }
     430
           // Disabled alternative: writes the A buffer to the frame buffer instead of Z
     431#if 0
     432   for (l = first; l < last; l++){
     433      for (p = 0; p < NP; p++){
     434         TA(cluster_id, l, p) = (unsigned char) ((TA(cluster_id, l, p) >> 8) & 0xFF);
     435      }
     436      fb_write(NP * (cluster_id * lines_per_cluster + l), &TA(cluster_id,l,0), NP);
     437   }
     438#endif
     439
     440   date  = proctime();
     441   PRINTF("*** Completing display at cycle %d\n", date);
     442   DISP_END[cluster_id][local_id] = date;
     443
     444   barrier_wait(3);
     445
     446   
     447   /////////////////////////////////////////////////////////
     448   // Instrumentation (done by processor 0 in cluster 0)   
     449   /////////////////////////////////////////////////////////
     450
     451   if (proc_id == 0){
     452      date  = proctime();
     453      PRINTF("\n*** Starting Instrumentation at cycle %d\n\n", date);
     454
     455      int cc, pp;
     456      unsigned int min_load_start = INT_MAX;
     457      unsigned int max_load_start = 0;
     458      unsigned int min_load_ended = INT_MAX;
     459      unsigned int max_load_ended = 0;
     460
     461      unsigned int min_hori_start = INT_MAX;
     462      unsigned int max_hori_start = 0;
     463      unsigned int min_hori_ended = INT_MAX;
     464      unsigned int max_hori_ended = 0;
     465
     466      unsigned int min_vert_start = INT_MAX;
     467      unsigned int max_vert_start = 0;
     468      unsigned int min_vert_ended = INT_MAX;
     469      unsigned int max_vert_ended = 0;
     470
     471      unsigned int min_disp_start = INT_MAX;
     472      unsigned int max_disp_start = 0;
     473      unsigned int min_disp_ended = INT_MAX;
     474      unsigned int max_disp_ended = 0;
     475
              // Scan every (cluster, processor) entry to get the min and max date of each event
     476      for (cc = 0; cc < nclusters; cc++){
     477         for (pp = 0; pp < nlocal_procs; pp++ ){
     478            if (LOAD_START[cc][pp] < min_load_start){
     479               min_load_start = LOAD_START[cc][pp];
     480            }
     481            if (LOAD_START[cc][pp] > max_load_start){
     482               max_load_start = LOAD_START[cc][pp];
     483            }
     484            if (LOAD_END[cc][pp] < min_load_ended){
     485               min_load_ended = LOAD_END[cc][pp];
     486            }
     487            if (LOAD_END[cc][pp] > max_load_ended){
     488               max_load_ended = LOAD_END[cc][pp];
     489            }
     490
     491            if (HORI_START[cc][pp] < min_hori_start){
     492               min_hori_start = HORI_START[cc][pp];
     493            }
     494            if (HORI_START[cc][pp] > max_hori_start){
     495               max_hori_start = HORI_START[cc][pp];
     496            }
     497            if (HORI_END[cc][pp] < min_hori_ended){
     498               min_hori_ended = HORI_END[cc][pp];
     499            }
     500            if (HORI_END[cc][pp] > max_hori_ended){
     501               max_hori_ended = HORI_END[cc][pp];
     502            }
     503
     504            if (VERT_START[cc][pp] < min_vert_start){
     505               min_vert_start = VERT_START[cc][pp];
     506            }
     507            if (VERT_START[cc][pp] > max_vert_start){
     508               max_vert_start = VERT_START[cc][pp];
     509            }
     510            if (VERT_END[cc][pp] < min_vert_ended){
     511               min_vert_ended = VERT_END[cc][pp];
     512            }
     513            if (VERT_END[cc][pp] > max_vert_ended){
     514               max_vert_ended = VERT_END[cc][pp];
     515            }
     516
     517            if (DISP_START[cc][pp] < min_disp_start){
     518               min_disp_start = DISP_START[cc][pp];
     519            }
     520            if (DISP_START[cc][pp] > max_disp_start){
     521               max_disp_start = DISP_START[cc][pp];
     522            }
     523            if (DISP_END[cc][pp] < min_disp_ended){
     524               min_disp_ended = DISP_END[cc][pp];
     525            }
     526            if (DISP_END[cc][pp] > max_disp_ended){
     527               max_disp_ended = DISP_END[cc][pp];
     528            }
     529         }
     530      }
              // Summary: "med" is the midpoint (min+max)/2 and "delta" the spread max-min
     531      PRINTF(" - LOAD_START : min = %d / max = %d / med = %d / delta = %d\n",
     532            min_load_start, max_load_start, (min_load_start+max_load_start) / 2, max_load_start-min_load_start);
     533      PRINTF(" - LOAD_END   : min = %d / max = %d / med = %d / delta = %d\n",
     534            min_load_ended, max_load_ended, (min_load_ended+max_load_ended) / 2, max_load_ended-min_load_ended);
     535
     536      PRINTF(" - HORI_START : min = %d / max = %d / med = %d / delta = %d\n",
     537            min_hori_start, max_hori_start, (min_hori_start+max_hori_start) / 2, max_hori_start-min_hori_start);
     538      PRINTF(" - HORI_END   : min = %d / max = %d / med = %d / delta = %d\n",
     539            min_hori_ended, max_hori_ended, (min_hori_ended+max_hori_ended) / 2, max_hori_ended-min_hori_ended);
     540
     541      PRINTF(" - VERT_START : min = %d / max = %d / med = %d / delta = %d\n",
     542            min_vert_start, max_vert_start, (min_vert_start+max_vert_start) / 2, max_vert_start-min_vert_start);
     543      PRINTF(" - VERT_END   : min = %d / max = %d / med = %d / delta = %d\n",
     544            min_vert_ended, max_vert_ended, (min_vert_ended+max_vert_ended) / 2, max_vert_ended-min_vert_ended);
     545
     546      PRINTF(" - DISP_START : min = %d / max = %d / med = %d / delta = %d\n",
     547            min_disp_start, max_disp_start, (min_disp_start+max_disp_start) / 2, max_disp_start-min_disp_start);
     548      PRINTF(" - DISP_END   : min = %d / max = %d / med = %d / delta = %d\n",
     549            min_disp_ended, max_disp_ended, (min_disp_ended+max_disp_ended) / 2, max_disp_ended-min_disp_ended);
     550
              // Gap between the last task finishing a phase and the first task starting the next one
     551      PRINTF(" - BARRIER LOAD/HORI = %d\n", min_hori_start - max_load_ended);
     552      PRINTF(" - BARRIER HORI/VERT = %d\n", min_vert_start - max_hori_ended);
     553      PRINTF(" - BARRIER VERT/DISP = %d\n", min_disp_start - max_vert_ended);
     554
     555      PRINTF(" - LOAD              = %d\n", max_load_ended);
     556      PRINTF(" - FILTER            = %d\n", max_vert_ended - max_load_ended);
     557      PRINTF(" - DISPLAY           = %d\n", max_disp_ended - max_vert_ended);
     558
              // Raw dump of every recorded date, one BEGIN/END section per event
     559      PRINTF("\nBEGIN LOAD_START\n");
     560      for (cc = 0; cc < nclusters; cc++){
     561         for (pp = 0; pp < nlocal_procs; pp++){
     562            PRINTF("cluster= %d proc= %d date= %d\n", cc, pp, LOAD_START[cc][pp]); 
     563         }
     564      }
     565      PRINTF("END\n");
     566
     567      PRINTF("\nBEGIN LOAD_END\n");
     568      for (cc = 0; cc < nclusters; cc++){
     569         for (pp = 0; pp < nlocal_procs; pp++){
     570            PRINTF("cluster= %d proc= %d date= %d\n", cc, pp, LOAD_END[cc][pp]); 
     571         }
     572      }
     573      PRINTF("END\n");
     574
     575      PRINTF("\nBEGIN HORI_START\n");
     576      for (cc = 0; cc < nclusters; cc++){
     577         for (pp = 0; pp < nlocal_procs; pp++){
     578            PRINTF("cluster= %d proc= %d date= %d\n", cc, pp, HORI_START[cc][pp]); 
     579         }
     580      }
     581      PRINTF("END\n");
     582
     583      PRINTF("\nBEGIN HORI_END\n");
     584      for (cc = 0; cc < nclusters; cc++){
     585         for (pp = 0; pp < nlocal_procs; pp++){
     586            PRINTF("cluster= %d proc= %d date= %d\n", cc, pp, HORI_END[cc][pp]); 
     587         }
     588      }
     589      PRINTF("END\n");
     590
     591      PRINTF("\nBEGIN VERT_START\n");
     592      for (cc = 0; cc < nclusters; cc++){
     593         for (pp = 0; pp < nlocal_procs; pp++){
     594            PRINTF("cluster= %d proc= %d date= %d\n", cc, pp, VERT_START[cc][pp]); 
     595         }
     596      }
     597      PRINTF("END\n");
     598
     599      PRINTF("\nBEGIN VERT_END\n");
     600      for (cc = 0; cc < nclusters; cc++){
     601         for (pp = 0; pp < nlocal_procs; pp++ ){
     602            PRINTF("cluster= %d proc= %d date= %d\n", cc, pp, VERT_END[cc][pp]); 
     603         }
     604      }
     605      PRINTF("END\n");
     606
     607      PRINTF("\nBEGIN DISP_START\n");
     608      for (cc = 0; cc < nclusters; cc++){
     609         for (pp = 0; pp < nlocal_procs; pp++){
     610            PRINTF("cluster= %d proc= %d date= %d\n", cc, pp, DISP_START[cc][pp]); 
     611         }
     612      }
     613      PRINTF("END\n");
     614
     615      PRINTF("\nBEGIN DISP_END\n");
     616      for (cc = 0; cc < nclusters; cc++){
     617         for (pp = 0; pp < nlocal_procs; pp++){
     618            PRINTF("cluster= %d proc= %d date= %d\n", cc, pp, DISP_END[cc][pp]); 
     619         }
     620      }
     621      PRINTF("END\n");
     622   }
     623
           // Never return: every processor spins here once its work is done
     624   while(1);
    574625
    575626} // end main()
    576627
     628// Local Variables:
     629// tab-width: 3
     630// c-basic-offset: 3
     631// c-file-offsets:((innamespace . 0)(inline-open . 0))
     632// indent-tabs-mode: nil
     633// End:
     634
     635// vim: filetype=cpp:expandtab:shiftwidth=3:tabstop=3:softtabstop=3
     636
     637
  • trunk/softs/soft_transpose_giet/main.c

    r244 r248  
    99#define NB_CLUSTER_MAX  256
    1010
    11 #define PRINTF(...)      ({ if (local_id == 0) { tty_printf(__VA_ARGS__); } })
     11#define PRINTF(...)      ({ if (proc_id == 0) { tty_printf(__VA_ARGS__); } })
     12
     13//#define DISPLAY_ONLY
    1214
    1315///////////////////////////////////////////
     
    104106
    105107
    106    PRINTF("*** starting barrier init at cycle %d ***\n", proctime());
     108   PRINTF("*** Starting barrier init at cycle %d ***\n", proctime());
    107109
    108110   //  barriers initialization
     
    111113   barrier_init(2, nglobal_procs);
    112114
    113    PRINTF("*** completing barrier init at cycle %d ***\n", proctime());
     115   PRINTF("*** Completing barrier init at cycle %d ***\n", proctime());
    114116
    115117   // Main loop (on frames)
     
    127129            LOAD_START[cluster_id][p] = date;
    128130         }
    129          tty_printf("    block_device offset : %d\n", nblocks * cluster_id / nclusters);
    130131         if (ioc_read(frame * nblocks + nblocks * cluster_id / nclusters, A[cluster_id], nblocks / nclusters)){
    131             tty_printf("echec ioc_read\n");
     132            PRINTF("echec ioc_read\n");
    132133            exit();
    133134         }
    134135         if (ioc_completed()){
    135             tty_printf("echec ioc_completed\n");
     136            PRINTF("echec ioc_completed\n");
    136137            exit();
    137138         }
     
    150151      // (p,l) are the (x,y) pixel coordinates in the source frame
    151152
     153#ifndef DISPLAY_ONLY
    152154      date = proctime();
    153155      PRINTF("\n*** Starting transpose for frame %d at cycle %d\n", frame, date);
     
    173175      PRINTF("*** Completing transpose for frame %d at cycle %d\n", frame, date);
    174176      TRSP_END[cluster_id][local_id] = date;
    175 
    176177      barrier_wait(1);
     178#endif
    177179
    178180      // parallel display from B[c] to frame buffer
     
    184186
    185187      unsigned int npxt = npixels / nglobal_procs;   // number of pixels per proc
    186       if (npixels - npxt * nglobal_procs != 0){
    187          tty_printf("*** Error line %d\n", __LINE__);
     188
     189#ifndef DISPLAY_ONLY
     190      if (fb_write(npxt * proc_id, B[cluster_id] + npxt * local_id, npxt)){
     191         PRINTF("[%d]: echec fb_sync_write\n", proc_id);
    188192         exit();
    189193      }
    190       tty_printf("    npxt : %d\n", npxt);
    191 
    192       if (fb_write(npxt * proc_id, B[cluster_id] + npxt * local_id, npxt)){
    193          tty_printf("[%d]: echec fb_sync_write\n", proc_id);
     194#else
     195      if (fb_write(npxt * proc_id, A[cluster_id] + npxt * local_id, npxt)){
     196         PRINTF("[%d]: echec fb_sync_write\n", proc_id);
    194197         exit();
    195198      }
    196      
    197       PRINTF("    After fb_write and before fb_completed\n");
     199#endif
    198200
    199201      if (fb_completed()){
    200          tty_printf("[%d]: echec fb_completed\n", proc_id);
     202         PRINTF("[%d]: echec fb_completed\n", proc_id);
    201203         exit();
    202204      }
Note: See TracChangeset for help on using the changeset viewer.