Context Navigation

← Previous Changeset
Next Changeset →

Changeset 159

Timestamp:

May 9, 2011, 6:13:44 PM (13 years ago)

Author:

alain

Message:

code optimisation for the vertical filter.

File:

: 1 edited

trunk/softs/soft_filter_giet/main.c (modified) (11 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/softs/soft_filter_giet/main.c

-                      r158
+                      r159
 ///////////////////////////////////
-    int vrange = 17;
     int vnorm  = 115;
     int vf[35];
 …
     //////////////////////////////////////////////////////////
     // parallel horizontal filter :
     //  B <= transpose(FH(A))
     //  D <= A - FH(A)
+    // B <= transpose(FH(A))
+    // D <= A - FH(A)
     // Each task computes (NL/ntasks) lines
     // The image must be extended :
 …
         // To minimize the number of tests, the loop on pixels is split in three domains
         int sum = (hrange+2)*TA(src_c, src_l, 0);
         for ( x = 1 ; x < hrange ; x++) sum = sum + TA(src_c, src_l, x);
+        int sum_p = (hrange+2)*TA(src_c, src_l, 0);
+        for ( x = 1 ; x < hrange ; x++) sum_p = sum_p + TA(src_c, src_l, x);
         // first domain : from 0 to hrange
 …
             int dst_c = p/pixels_per_cluster;
             int dst_p = p%pixels_per_cluster;
             sum = sum + (int)TA(src_c, src_l, p+hrange) - (int)TA(src_c, src_l, 0);
             TB(dst_c, dst_p, l) = sum/hnorm;
             TD(src_c, src_l, p) = (int)TA(src_c, src_l, p) - sum/hnorm;
+            sum_p = sum_p + (int)TA(src_c, src_l, p+hrange) - (int)TA(src_c, src_l, 0);
+            TB(dst_c, dst_p, l) = sum_p/hnorm;
+            TD(src_c, src_l, p) = (int)TA(src_c, src_l, p) - sum_p/hnorm;
+        }
         // second domain : from (hrange+1) to (NP-hrange-1)
 …
             int dst_c = p/pixels_per_cluster;
             int dst_p = p%pixels_per_cluster;
             sum = sum + (int)TA(src_c, src_l, p+hrange) - (int)TA(src_c, src_l, p-hrange-1);
             TB(dst_c, dst_p, l) = sum/hnorm;
             TD(src_c, src_l, p) = (int)TA(src_c, src_l, p) - sum/hnorm;
+            sum_p = sum_p + (int)TA(src_c, src_l, p+hrange) - (int)TA(src_c, src_l, p-hrange-1);
+            TB(dst_c, dst_p, l) = sum_p/hnorm;
+            TD(src_c, src_l, p) = (int)TA(src_c, src_l, p) - sum_p/hnorm;
+        }
         // third domain : from (NP-hrange) to (NP-1)
 …
             int dst_c = p/pixels_per_cluster;
             int dst_p = p%pixels_per_cluster;
             sum = sum + (int)TA(src_c, src_l, NP-1) - (int)TA(src_c, src_l, p-hrange-1);
             TB(dst_c, dst_p, l) = sum/hnorm;
             TD(src_c, src_l, p) = (int)TA(src_c, src_l, p) - sum/hnorm;
+            sum_p = sum_p + (int)TA(src_c, src_l, NP-1) - (int)TA(src_c, src_l, p-hrange-1);
+            TB(dst_c, dst_p, l) = sum_p/hnorm;
+            TD(src_c, src_l, p) = (int)TA(src_c, src_l, p) - sum_p/hnorm;
+        }
 …
         int src_p = p%pixels_per_cluster;
+        for ( l=0 ; l<NL ; l++ )
+        int sum_l;
+        // We use the specific values of the vertical ep-filter
+        // To minimize the number of tests, the NL lines are split in three domains
+        // first domain : explicit computation for the first 18 values
+        for ( l=0 ; l<18 ; l++)
+        {
             // dst_c and dst_l are the cluster index and the line index for C
 …
             int dst_l = l%lines_per_cluster;
+            int sum = 0;
+            for ( x=0 ; x<(2*vrange + 1) ; x++ )
+            for ( x=0, sum_l=0 ; x<35 ; x++ )
+            {
+                int     z;
+                if      ( (l-vrange+x) < 0 )            z = 0;
+                else if ( (l-vrange+x) > (NL-1) )       z = NL-1;
+                else                                    z = l-vrange+x;
+                sum = sum + vf[x]*TB(src_c, src_p, z);
+                sum_l = sum_l + vf[x] * TB(src_c, src_p, max(l-17+x,0) );
+            }
+            TC(dst_c, dst_l, p) = sum/vnorm;
+        }
+/**********************************************************************************
+        // We use the specific values of the vertical ep-filter
+        // To minimize the number of tests, the NL lines are split in three domains
+        int sum = 0;
+        // first domain
+        for ( l = 0 ; l < vrange ; l++)
+            TC(dst_c, dst_l, p) = sum_l/vnorm;
+        }
+        // second domain
+        for ( l = 18 ; l < NL-17 ; l++ )
+        {
             // dst_c and dst_l are the cluster index and the line index for C
 …
             int dst_l = l%lines_per_cluster;
+            for ( x = 0 ; x < (2*vrange+1) ; x++ )
+            {
+                sum = sum + vf[x] * TB(src_c, src_p, max(l-vrange+x,0));
+            }
+            TC(dst_c, dst_l, p) = sum/vnorm;
+        }
+        // second domain
+        for ( l = vrange ; l < NL-vrange ; l++ )
+            sum_l = sum_l + TB(src_c, src_p, l+4)
+                          + TB(src_c, src_p, l+8)
+                          + TB(src_c, src_p, l+11)
+                          + TB(src_c, src_p, l+15)
+                          + TB(src_c, src_p, l+17)
+                          - TB(src_c, src_p, l-5)
+                          - TB(src_c, src_p, l-9)
+                          - TB(src_c, src_p, l-12)
+                          - TB(src_c, src_p, l-16)
+                          - TB(src_c, src_p, l-18);
+            TC(dst_c, dst_l, p) = sum_l/vnorm;
+        }
+        // third domain
+        for ( l = NL-17 ; l < NL ; l++ )
+        {
             // dst_c and dst_l are the cluster index and the line index for C
 …
             int dst_l = l%lines_per_cluster;
+            sum = sum + TB(src_c, src_p, l+4)
+                      + TB(src_c, src_p, l+8)
+                      + TB(src_c, src_p, l+11)
+                      + TB(src_c, src_p, l+15)
+                      + TB(src_c, src_p, l+17)
+                      - TB(src_c, src_p, l-5)
+                      - TB(src_c, src_p, l-9)
+                      - TB(src_c, src_p, l-12)
+                      - TB(src_c, src_p, l-16)
+                      - TB(src_c, src_p, max(l-18,0));
+            TC(dst_c, dst_l, p) = sum/vnorm;
+        }
+        // third domain
+        for ( l = NL-vrange ; l < NL ; l++ )
+        {
+            // dst_c and dst_l are the cluster index and the line index for C
+            int dst_c = l/lines_per_cluster;
+            int dst_l = l%lines_per_cluster;
+            sum = sum + TB(src_c, src_p, min(l+5,NL-1))
+                      + TB(src_c, src_p, min(l+9,NL-1))
+                      + TB(src_c, src_p, min(l+12,NL-1))
+                      + TB(src_c, src_p, min(l+16,NL-1))
+                      + TB(src_c, src_p, min(l+18,NL-1))
+                      - TB(src_c, src_p, l-4)
+                      - TB(src_c, src_p, l-8)
+                      - TB(src_c, src_p, l-11)
+                      - TB(src_c, src_p, l-15)
+                      - TB(src_c, src_p, l-17);
+            TC(dst_c, dst_l, p) = sum/vnorm;
+        }
+*****************************************************************************/
+            sum_l = sum_l + TB(src_c, src_p, min(l+4,NL-1))
+                          + TB(src_c, src_p, min(l+8,NL-1))
+                          + TB(src_c, src_p, min(l+11,NL-1))
+                          + TB(src_c, src_p, min(l+15,NL-1))
+                          + TB(src_c, src_p, min(l+17,NL-1))
+                          - TB(src_c, src_p, l-5)
+                          - TB(src_c, src_p, l-9)
+                          - TB(src_c, src_p, l-12)
+                          - TB(src_c, src_p, l-16)
+                          - TB(src_c, src_p, l-18);
+            TC(dst_c, dst_l, p) = sum_l/vnorm;
+        }
         PRINTF(" - column %d computed at cycle %d\n", p, proctime());
+    }
 …
     barrier_wait(2);
     ////////////////////////////////////////////////////////////////////////////
     // final computation and parallel display using the distributed DMA
     // D <= D + C
+    ////////////////////////////////////////////////////////////////
+    // final computation and parallel display
+    // Z <= D + C
     // Each processor uses its private DMA channel to display
     // the resulting image, line by line (one byte per pixel).

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 159

Legend:

trunk/softs/soft_filter_giet/main.c

Download in other formats: