Changeset 637 for trunk/user/fft


Ignore:
Timestamp:
Jul 18, 2019, 2:06:55 PM (5 years ago)
Author:
alain
Message:

Introduce the non-standard pthread_parallel_create() system call
and re-write the <fft> and <sort> applications to improve the
intrinsic parallelism in applications.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/user/fft/fft.c

    r636 r637  
    2222// of N complex points, using the Cooley-Tukey FFT method.
    2323// The N data points are seen as a 2D array (rootN rows * rootN columns).
    24 // Each thread handle (rootN / nthreads) rows. The N input data points
    25 // be initialised in three different modes:
     24// Each thread handles (rootN / nthreads) rows.
     25// The N input data points can be initialised in three different modes:
    2626// - CONSTANT : all data points have the same [1,0] value
    2727// - COSIN    : data point n has [cos(n/N) , sin(n/N)] values
     
    3131//  - M : N = 2**M = number of data points / M must be an even number.
    3232//  - T : nthreads = ncores defined by the hardware / must be power of 2.
     33// The number of threads cannot be larger than the number of rows.
    3334//
    34 // This application uses 4 shared data arrays, that are dynamically
    35 // allocated an distributed, using the remote_malloc() function, with
    36 // one sub-buffer per cluster:
    37 // - data[N] contains N input data points, with 2 double per point.
    38 // - trans[N] contains N intermediate data points, 2 double per point.
    39 // - umain[rootN] contains rootN coefs required for a rootN points FFT.
    40 // - twid[N] contains N coefs : exp(2*pi*i*j/N) / i and j in [0,rootN-1].
    41 // For data, trans, twid, each sub-buffer contains (N/nclusters) points.
    42 // For umain, each sub-buffer contains (rootN/nclusters) points.
     35// This application uses 3 shared data arrays, that are dynamically
     36// allocated and distributed in clusters, with one sub-buffer per cluster:
     37// - data[N] contains N input data points,
     38// - trans[N] contains N intermediate data points,
     39// - twid[N] contains N coefs : exp(2*pi*i*j/N) / i and j in [0,rootN-1]
     40// Each sub-buffer contains (N/nclusters) entries, with 2 double per entry.
     41// These distributed buffers are allocated and initialised in parallel
     42// by the working threads running on core 0 in each cluster.
    4343//
    44 // There is one thread per core.
    45 // The max number of clusters is defined by (X_MAX * Y_MAX).
    46 // The max number of cores per cluster is defined by CORES_MAX.
     44// Each working thread allocates also a private coefs[rootN-1] buffer,
     45// that contains all coefs required for a rootN points FFT.
     46//
     47// There is one working thread per core.
     48// The actual number of cores and clusters in a given hardware architecture
     49// is obtained by the get_config() syscall (x_size, y_size, ncores).
     50// The max number of clusters is bounded by (X_MAX * Y_MAX).
     51// The max number of cores per cluster is bounded by CORES_MAX.
    4752//
    4853// Several configuration parameters can be defined below:
     
    5762//   by the main thread in the main() function.
    5863// - The parallel execution time (parallel_time[i]) is computed by each
    59 //   thread(i) in the slave() function.
     64//   working thread(i) in the work() function.
    6065// - The synchronisation time related to the barriers (sync_time[i])
    61 //   is computed by each thread(i) in the slave() function.
     66//   is computed by each thread(i) in the work() function.
    6267// The results are displayed on the TXT terminal, and registered on disk.
    6368///////////////////////////////////////////////////////////////////////////
     
    8792// parameters
    8893
    89 #define DEFAULT_M               12              // 4096 data points
    90 #define USE_DQT_BARRIER         0               // use DDT barrier if non zero
     94#define DEFAULT_M               14              // 16384 data points
     95#define USE_DQT_BARRIER         1               // use DDT barrier if non zero
    9196#define MODE                    COSIN           // DATA array initialisation mode
    9297#define CHECK                   0               
    93 #define DEBUG_MAIN              0               // trace main() function (detailed if odd)
    94 #define DEBUG_SLAVE             0               // trace slave() function (detailed if odd)
     98#define DEBUG_MAIN              1               // trace main() function (detailed if odd)
     99#define DEBUG_WORK              1               // trace work() function (detailed if odd)
    95100#define DEBUG_FFT1D             0               // trace FFT1D() function (detailed if odd)
    96101#define DEBUG_ROW               0               // trace FFTRow() function (detailed if odd)
     
    101106
    102107/////////////////////////////////////////////////////////////////////////////////////
    103 //             structure containing the arguments for the slave() function
     108//             FFT specific global variables
    104109/////////////////////////////////////////////////////////////////////////////////////
    105110
    106 typedef struct args_s
    107 {
    108     unsigned int   tid;                    // thread continuous index
    109     unsigned int   main_tid;               // main thread continuous index
     111// work function arguments
     112typedef struct work_args_s
     113{
     114    unsigned int        tid;               // thread continuous index
     115    unsigned int        lid;               // core local index
     116    unsigned int        cid;               // cluster continuous index
     117    pthread_barrier_t * parent_barrier;    // parent barrier to signal completion
    110118}
    111 args_t;
    112 
    113 /////////////////////////////////////////////////////////////////////////////////////
    114 //             global variables
    115 /////////////////////////////////////////////////////////////////////////////////////
    116 
    117 unsigned int   x_size;                     // number of clusters per row in the mesh
    118 unsigned int   y_size;                     // number of clusters per column in the mesh
    119 unsigned int   ncores;                     // number of cores per cluster
     119work_args_t;
     120
    120121unsigned int   nthreads;                   // total number of threads (one thread per core)
    121122unsigned int   nclusters;                  // total number of clusters
     
    129130double *       data[CLUSTERS_MAX];         // original time-domain data
    130131double *       trans[CLUSTERS_MAX];        // used as auxiliary space for transpose
     132double *       twid[CLUSTERS_MAX];         // twiddle factor : exp(-2iPI*k*n/N)
    131133double *       bloup[CLUSTERS_MAX];        // used as auxiliary space for DFT
    132 double *       umain[CLUSTERS_MAX];        // roots of unity used fo rootN points FFT   
    133 double *       twid[CLUSTERS_MAX];         // twiddle factor : exp(-2iPI*k*n/N)
    134134
    135135// instrumentation counters
     
    142142pthread_barrierattr_t  barrier_attr;
    143143
    144 // threads identifiers, attributes, and arguments
    145 pthread_t       trdid[THREADS_MAX];        // kernel threads identifiers
    146 pthread_attr_t  attr[THREADS_MAX];         // POSIX thread attributes
    147 args_t          args[THREADS_MAX];         // slave function arguments
    148 
    149 /////////////////////////////////////////////////////////////////////////////////
     144/////////////////////////////////////////////////////////////////////////////////////
     145//             Global variables required by pthread_parallel_create()
     146/////////////////////////////////////////////////////////////////////////////////////
     147
     148// 2D arrays of input arguments for the <work> threads
     149// These arrays are initialised by the application main thread
     150
     151work_args_t       work_args[CLUSTERS_MAX][CORES_MAX];  // work function arguments
     152work_args_t     * work_ptrs[CLUSTERS_MAX][CORES_MAX];  // pointers on arguments
     153
     154// 1D array of barriers to allow the <work> threads to signal termination
     155// this array is initialised in each cluster by the <build[cxy][0]> thread
     156 
     157pthread_barrier_t parent_barriers[CLUSTERS_MAX];        // termination barrier
     158
     159/////////////////////////////////////////////////////////////////////////////////////
    150160//           functions declaration
    151 /////////////////////////////////////////////////////////////////////////////////
    152 
    153 void slave( args_t * args );
     161/////////////////////////////////////////////////////////////////////////////////////
     162
     163void work( work_args_t * args );
    154164
    155165double CheckSum( void );
    156166
    157 void InitX(double ** x , unsigned int mode);
    158 
    159 void InitU(double ** u);
    160 
    161 void InitT(double ** u);
     167void InitD( double    ** data ,
     168            unsigned int mode,
     169            unsigned int tid );
     170
     171void InitT( double    ** twid,
     172            unsigned int tid );
     173
     174void InitU( double * coefs );
    162175
    163176unsigned int BitReverse( unsigned int k );
     
    168181            double     * upriv,
    169182            double    ** twid,
    170             unsigned int MyNum,
     183            unsigned int tid,
    171184            unsigned int MyFirst,
    172185            unsigned int MyLast );
     
    217230    int                 error;
    218231
    219     unsigned int        main_cxy;          // main thread cluster
    220     unsigned int        main_x;            // main thread X coordinate
    221     unsigned int        main_y;            // main thread y coordinate
    222     unsigned int        main_lid;          // main thread local core index
    223     unsigned int        main_tid;          // main thread continuous index
     232    unsigned int        x_size;            // number of clusters per row
     233    unsigned int        y_size;            // number of clusters per column
     234    unsigned int        ncores;            // max number of cores per cluster
    224235
    225236    unsigned int        x;                 // current index for cluster X coordinate
    226237    unsigned int        y;                 // current index for cluster Y coordinate
    227238    unsigned int        lid;               // current index for core in a cluster
    228     unsigned int        ci;                // continuous cluster index (from x,y)
     239    unsigned int        tid;               // continuous thread index
     240    unsigned int        cid;               // cluster continuous index
    229241    unsigned int        cxy;               // hardware specific cluster identifier
    230     unsigned int        tid;               // continuous thread index
     242
     243    char                name[64];          // instrumentation file name
     244    char                path[128];         // instrumentation path name
     245    char                string[256];
     246    int                 ret;
    231247
    232248    unsigned long long  start_init_cycle;
    233249    unsigned long long  end_init_cycle;
    234250
     251#if DEBUG_MAIN
     252    unsigned long long  debug_cycle;
     253#endif
     254
    235255#if CHECK
    236 double     ck1;           // for input/output checking
    237 double     ck3;           // for input/output checking
     256    double              ck1;               // for input/output checking
     257    double              ck3;               // for input/output checking
    238258#endif
    239259   
     
    241261    get_cycle( &start_init_cycle );
    242262
    243     // get platform parameters to compute nthreads & nclusters
     263    // get platform parameters
    244264    if( get_config( &x_size , &y_size , &ncores ) )
    245265    {
     
    269289    }
    270290
     291    // compute nthreads and nclusters
    271292    nthreads  = x_size * y_size * ncores;
    272293    nclusters = x_size * y_size;
     294
     295    // compute covering DQT size and level
     296    unsigned int z = (x_size > y_size) ? x_size : y_size;
     297    unsigned int root_level = (z == 1) ? 0 : (z == 2) ? 1 : (z == 4) ? 2 : (z == 8) ? 3 : 4;
    273298
    274299    // compute various constants depending on N and T
     
    285310    }
    286311
    287     // get main thread coordinates (main_x, main_y, main_lid)
    288     get_core( &main_cxy , &main_lid );
    289     main_x   = HAL_X_FROM_CXY( main_cxy );
    290     main_y   = HAL_Y_FROM_CXY( main_cxy );
    291     main_tid = (((main_x * y_size) + main_y) * ncores) + main_lid;
    292 
    293     printf("\n[fft] starts / core[%x,%d] / %d points / %d thread(s) / PID %x / cycle %d\n",
    294     main_cxy, main_lid, N, nthreads, getpid(), (unsigned int)start_init_cycle );
    295 
    296     // allocate memory for the distributed data[i], trans[i], umain[i], twid[i] buffers
    297     // the index (i) is a continuous cluster index
    298     unsigned int data_size   = (N / nclusters) * 2 * sizeof(double);
    299     unsigned int coefs_size  = (rootN / nclusters) * 2 * sizeof(double);
    300     for (x = 0 ; x < x_size ; x++)
    301     {
    302         for (y = 0 ; y < y_size ; y++)
    303         {
    304             ci         = x * y_size + y;
    305             cxy        = HAL_CXY_FROM_XY( x , y );
    306             data[ci]   = (double *)remote_malloc( data_size  , cxy );
    307             trans[ci]  = (double *)remote_malloc( data_size  , cxy );
    308             bloup[ci]  = (double *)remote_malloc( data_size  , cxy );
    309             umain[ci]  = (double *)remote_malloc( coefs_size , cxy );
    310             twid[ci]   = (double *)remote_malloc( data_size  , cxy );
    311         }
     312    printf("\n[fft] starts / %d points / %d thread(s) / PID %x / cycle %d\n",
     313    N, nthreads, getpid(), (unsigned int)start_init_cycle );
     314
     315    // build instrumentation file name
     316    if( USE_DQT_BARRIER )
     317    snprintf( name , 64 , "p_fft_dqt_%d_%d_%d", N , x_size * y_size , ncores );
     318    else
     319    snprintf( name , 64 , "p_fft_smp_%d_%d_%d", N , x_size * y_size , ncores );
     320
     321    // build pathname
     322    snprintf( path , 128 , "/home/%s", name );
     323
     324    // open instrumentation file
     325    FILE * f = fopen( path , NULL );
     326    if ( f == NULL )
     327    {
     328        printf("\n[fft error] cannot open instrumentation file <%s>\n", path );
     329        exit( 0 );
    312330    }
    313331
    314332#if DEBUG_MAIN
    315 printf("\n[fft] main completes remote_malloc\n");
    316 #endif
    317 
    318     // arrays initialisation
    319     InitX( data , MODE );
    320     InitU( umain );
    321     InitT( twid );
    322 
    323 #if DEBUG_MAIN
    324 printf("\n[fft] main completes arrays init\n");
     333get_cycle( &debug_cycle );
     334printf("\n[fft] main open file <%s> at cycle %d\n",
     335path, (unsigned int)debug_cycle );
    325336#endif
    326337
     
    342353#endif
    343354
    344     // initialise barrier
     355    // initialise barrier synchronizing all <work> threads
    345356    if( USE_DQT_BARRIER )
    346357    {
     
    362373
    363374#if DEBUG_MAIN
    364 printf("\n[fft] main completes barrier init\n");
    365 #endif
    366 
    367     // launch other threads to execute the slave() function
    368     // on cores other than the core running the main thread
     375get_cycle( &debug_cycle );
     376printf("\n[fft] main completes barrier init at cycle %d\n",
     377(unsigned int)debug_cycle );
     378#endif
     379
     380    // build array of arguments for the <work> threads
    369381    for (x = 0 ; x < x_size ; x++)
    370382    {
     
    376388            for ( lid = 0 ; lid < ncores ; lid++ )
    377389            {
    378                 // compute thread user index (continuous index)
    379                 tid = (((x * y_size) + y) * ncores) + lid;
    380 
    381                 // set thread attributes
    382                 attr[tid].attributes = PT_ATTR_CLUSTER_DEFINED | PT_ATTR_CORE_DEFINED;
    383                 attr[tid].cxy        = cxy;
    384                 attr[tid].lid        = lid;
    385 
    386                 // set slave function argument
    387                 args[tid].tid      = tid;
    388                 args[tid].main_tid = main_tid;
    389 
    390                 // create thread
    391                 if( tid != main_tid )
    392                 {
    393                     if ( pthread_create( &trdid[tid],  // pointer on kernel identifier
    394                                          &attr[tid],   // pointer on thread attributes
    395                                          &slave,       // pointer on function
    396                                          &args[tid]) ) // pointer on function arguments
    397                     {
    398                         printf("\n[fft error] creating thread %x\n", tid );
    399                         exit( 0 );
    400                     }
    401 
    402 #if (DEBUG_MAIN & 1)
    403 unsigned long long debug_cycle;
    404 get_cycle( &debug_cycle );
    405 printf("\n[fft] main created thread %d on core[%x,%d] / cycle %d\n",
    406 tid, cxy, lid, (unsigned int)debug_cycle );
    407 #endif
    408                 }
     390                // compute cluster continuous index
     391                cid = (x * y_size) + y;
     392
     393                // compute work thread continuous index
     394                tid = (cid * ncores) + lid;
     395               
     396                // initialize 2D array of arguments
     397                work_args[cxy][lid].tid            = tid;
     398                work_args[cxy][lid].lid            = lid;
     399                work_args[cxy][lid].cid            = cid;
     400                work_args[cxy][lid].parent_barrier = &parent_barriers[cxy];
     401
     402                // initialize 2D array of pointers
     403                work_ptrs[cxy][lid] = &work_args[cxy][lid];
    409404            }
    410405        }
    411406    }
    412407
     408    // register sequential time
     409    get_cycle( &end_init_cycle );
     410    init_time = (unsigned int)(end_init_cycle - start_init_cycle);
     411
    413412#if DEBUG_MAIN
    414 printf("\n[fft] main completes threads creation\n");
    415 #endif
    416 
    417     get_cycle( &end_init_cycle );
    418 
    419     // register sequencial time
    420     init_time = (unsigned int)(end_init_cycle - start_init_cycle);
    421    
    422     // main itself executes the slave() function
    423     slave( &args[main_tid] );
    424 
    425     // wait other threads completion
    426     for (x = 0 ; x < x_size ; x++)
    427     {
    428         for (y = 0 ; y < y_size ; y++)
    429         {
    430             for ( lid = 0 ; lid < ncores ; lid++ )
    431             {
    432                 // compute thread continuous index
    433                 tid = (((x * y_size) + y) * ncores) + lid;
    434 
    435                 if( tid != main_tid )
    436                 {
    437                     if( pthread_join( trdid[tid] , NULL ) )
    438                     {
    439                         printf("\n[fft error] in main thread joining thread %x\n", tid );
    440                         exit( 0 );
    441                     }
    442                    
    443 #if (DEBUG_MAIN & 1)
    444 printf("\n[fft] main thread %d joined thread %d\n", main_tid, tid );
    445 #endif
    446 
    447                 }
    448             }
    449         }
    450     }
     413printf("\n[fft] main completes <work> threads arguments at cycle %d\n",
     414(unsigned int)end_init_cycle );
     415#endif
     416
     417    // create and execute the working threads
     418    if( pthread_parallel_create( root_level,
     419                                 &work,
     420                                 &work_ptrs[0][0],
     421                                 &parent_barriers[0] ) )
     422    {
     423        printf("\n[fft error] creating threads\n");
     424        exit( 0 );
     425    }
     426
     427#if DEBUG_MAIN
     428get_cycle( &debug_cycle );
     429printf("\n[fft] main resume for instrumentation at cycle %d\n",
     430(unsigned int)debug_cycle) ;
     431#endif
    451432
    452433#if PRINT_ARRAY
     
    463444#endif
    464445
    465     // instrumentation
    466     char name[64];
    467     char path[128];
    468     char string[256];
    469     int  ret;
    470 
    471     // build file name
    472     if( USE_DQT_BARRIER )
    473     snprintf( name , 64 , "fft_dqt_%d_%d_%d", N , x_size * y_size , ncores );
    474     else
    475     snprintf( name , 64 , "fft_smp_%d_%d_%d", N , x_size * y_size , ncores );
    476 
    477     // build pathname
    478     snprintf( path , 128 , "/home/%s", name );
    479 
    480     // open instrumentation file
    481     FILE * f = fopen( path , NULL );
    482     if ( f == NULL )
    483     {
    484         printf("\n[fft error] cannot open instrumentation file <%s>\n", path );
    485         exit( 0 );
    486     }
    487     printf("\n[fft] file <%s> open\n", path );
    488 
    489446    // display header on terminal, and save to file
    490447    printf("\n----- %s -----\n", name );
     
    497454    }
    498455
    499     // display results for each thread on terminal, and save to file
     456    // get instrumentation results for each thread
    500457    for (tid = 0 ; tid < nthreads ; tid++)
    501458    {
     
    503460        tid, init_time, parallel_time[tid], sync_time[tid] );
    504461
    505         // display on terminal, and save to instrumentation file
    506         printf("%s" , string );
     462        // save  to instrumentation file
    507463        fprintf( f , "%s" , string );
    508464        if( ret < 0 )
    509465        {
    510466            printf("\n[fft error] cannot write thread %d to file <%s>\n", tid, path );
     467            printf("%s", string );
    511468            exit(0);
    512469        }
    513470    }
    514471
    515     // display MIN/MAX values on terminal and save to file
     472    // compute min/max values
    516473    unsigned int min_para = parallel_time[0];
    517474    unsigned int max_para = parallel_time[0];
     
    527484    }
    528485
     486    // display MIN/MAX values on terminal and save to file
    529487    snprintf( string , 256 , "\n      Sequencial  Parallel       Barrier\n"
    530488                             "MIN : %d\t | %d\t | %d\t   (cycles)\n"
     
    547505        exit(0);
    548506    }
    549     printf("\n[fft] file <%s> closed\n", path );
     507 
     508#if DEBUG_MAIN
     509get_cycle( &debug_cycle );
     510printf("\n[fft] main close file <%s> at cycle %d\n",
     511path, (unsigned int)debug_cycle );
     512#endif
    550513
    551514    exit( 0 );
     
    553516} // end main()
    554517
    555 ///////////////////////////////////////////////////////////////
    556 // This function is executed in parallel by all threads.
    557 ///////////////////////////////////////////////////////////////
    558 void slave( args_t * args )
    559 {
    560     unsigned int   i;
    561     unsigned int   MyNum;           // this thread index
    562     unsigned int   MainNum;         // main thread index
    563     unsigned int   MyFirst;         // index first row allocated to thread
    564     unsigned int   MyLast;          // index last row allocated to thread
    565     double       * upriv;
    566     unsigned int   c_id;
    567     unsigned int   c_offset;
     518/////////////////////////////////////////////////////////////////
     519// This function is executed in parallel by all <work> threads.
     520/////////////////////////////////////////////////////////////////
     521void work( work_args_t * args )
     522{
     523    unsigned int        tid;              // this thread continuous index
     524    unsigned int        lid;              // core local index
     525    unsigned int        cid;              // cluster continuous index
     526    pthread_barrier_t * parent_barrier;   // pointer on parent barrier
     527
     528    unsigned int        MyFirst;          // index first row allocated to thread
     529    unsigned int        MyLast;           // index last row allocated to thread
     530    double            * upriv;            // private array of FFT coefs
    568531
    569532    unsigned long long  parallel_start;
     
    572535    unsigned long long  barrier_stop;
    573536
    574     MyNum   = args->tid;
    575     MainNum = args->main_tid;
     537    // get thread arguments
     538    tid            = args->tid;
     539    lid            = args->lid;             
     540    cid            = args->cid;             
     541    parent_barrier = args->parent_barrier;
    576542
    577543    get_cycle( &parallel_start );
    578544
    579 #if DEBUG_SLAVE
     545#if DEBUG_WORK
    580546printf("\n[fft] %s : thread %d enter / cycle %d\n",
    581 __FUNCTION__, MyNum, (unsigned int)parallel_start );
    582 #endif
     547__FUNCTION__, tid, (unsigned int)parallel_start );
     548#endif
     549
     550    // core 0 allocates memory from the local cluster
     551    // for the distributed data[], trans[], twid[] buffers
     552    // and for the private upriv[] buffer
     553    if( lid == 0 )
     554    {
     555        unsigned int data_size  = (N / nclusters) * 2 * sizeof(double);
     556        unsigned int coefs_size = (rootN - 1) * 2 * sizeof(double); 
     557
     558        data[cid]   = (double *)malloc( data_size );
     559        trans[cid]  = (double *)malloc( data_size );
     560        twid[cid]   = (double *)malloc( data_size );
     561
     562        upriv       = (double *)malloc( coefs_size );
     563    }
    583564
    584565    // BARRIER
     
    586567    pthread_barrier_wait( &barrier );
    587568    get_cycle( &barrier_stop );
    588     sync_time[MyNum] += (unsigned int)(barrier_stop - barrier_start);
    589 
    590 #if DEBUG_SLAVE
    591 printf("\n[@@@] %s : thread %d exit first barrier / cycle %d\n",
    592 __FUNCTION__, MyNum, (unsigned int)barrier_stop );
    593 #endif
    594 
    595     // allocate and initialise local array upriv[]
    596     // that is a local copy of the rootN coefs defined in umain[]
    597     upriv = (double *)malloc(2 * (rootN - 1) * sizeof(double)); 
    598     for ( i = 0 ; i < (rootN - 1) ; i++)
    599     {
    600         c_id     = i / (rootN / nclusters);
    601         c_offset = i % (rootN / nclusters);
    602         upriv[2*i]   = umain[c_id][2*c_offset];
    603         upriv[2*i+1] = umain[c_id][2*c_offset+1];
    604     }
     569    sync_time[tid] += (unsigned int)(barrier_stop - barrier_start);
     570
     571#if DEBUG_WORK
     572printf("\n[fft] %s : thread %d exit first barrier / cycle %d\n",
     573__FUNCTION__, tid, (unsigned int)barrier_stop );
     574#endif
     575
     576    // all threads initialize data[] local array
     577    InitD( data , MODE , tid );
     578
     579    // all threads initialize twid[] local array
     580    InitT( twid , tid );
     581   
     582    // all threads initialise private upriv[] array
     583    InitU( upriv );
     584
     585    // BARRIER
     586    get_cycle( &barrier_start );
     587    pthread_barrier_wait( &barrier );
     588    get_cycle( &barrier_stop );
     589    sync_time[tid] += (unsigned int)(barrier_stop - barrier_start);
     590
     591#if DEBUG_WORK
     592printf("\n[fft] %s : thread %d exit second barrier / cycle %d\n",
     593__FUNCTION__, tid, (unsigned int)barrier_stop );
     594#endif
    605595
    606596    // compute first and last rows handled by the thread
    607     MyFirst = rootN * MyNum / nthreads;
    608     MyLast  = rootN * (MyNum + 1) / nthreads;
     597    MyFirst = rootN * tid / nthreads;
     598    MyLast  = rootN * (tid + 1) / nthreads;
    609599
    610600    // perform forward FFT
    611     FFT1D( 1 , data , trans , upriv , twid , MyNum , MyFirst , MyLast );
     601    FFT1D( 1 , data , trans , upriv , twid , tid , MyFirst , MyLast );
    612602
    613603#if CHECK
     
    615605pthread_barrier_wait( &barrier );
    616606get_cycle( &barrier_stop );
    617 sync_time[MyNum] += (unsigned int)(barrier_stop - barrier_start);
    618 FFT1D( -1 , data , trans , upriv , twid , MyNum , MyFirst , MyLast );
     607sync_time[tid] += (unsigned int)(barrier_stop - barrier_start);
     608FFT1D( -1 , data , trans , upriv , twid , tid , MyFirst , MyLast );
    619609#endif
    620610
     
    622612
    623613    // register parallel time
    624     parallel_time[MyNum] = (unsigned int)(parallel_stop - parallel_start);
    625 
    626 #if DEBUG_SLAVE
    627 printf("\n[fft] %s : thread %x completes fft / p_start %d / p_stop %d\n",
    628 __FUNCTION__, MyNum, (unsigned int)parallel_start, (unsigned int)parallel_stop );
    629 int tid;
    630 for (tid = 0 ; tid < nthreads ; tid++)
    631 {
    632     printf("- tid %d : Sequencial %d / Parallel %d / Barrier %d\n",
    633     tid , init_time, parallel_time[tid], sync_time[tid] );
    634 }
    635 #endif
    636 
    637     // exit only if MyNum != MainNum
    638     if( MyNum != MainNum ) pthread_exit( NULL );
    639 
    640 }  // end slave()
     614    parallel_time[tid] = (unsigned int)(parallel_stop - parallel_start);
     615
     616#if DEBUG_WORK
     617printf("\n[fft] %s : thread %d completes fft / p_start %d / p_stop %d\n",
     618__FUNCTION__, tid, (unsigned int)parallel_start, (unsigned int)parallel_stop );
     619#endif
     620
     621    //  work thread signals completion to main
     622    pthread_barrier_wait( parent_barrier );
     623
     624#if DEBUG_WORK
     625printf("\n[fft] %s : thread %d exit\n",
     626__FUNCTION__, tid );
     627#endif
     628
     629    //  work thread exit
     630    pthread_exit( NULL );
     631
     632}  // end work()
    641633
    642634////////////////////////////////////////////////////////////////////////////////////////
     
    724716}
    725717
    726 
    727 ////////////////////////////
    728 void InitX(double      ** x,
    729            unsigned int   mode )
     718//////////////////////////////////////////////////////////////////////////////////////
     719// Each working thread <tid> initializes (rootN / nthreads) rows,
     720// in the shared - and distributed - <data> array.
     721//////////////////////////////////////////////////////////////////////////////////////
     722void InitD(double      ** data,
     723           unsigned int   mode,
     724           unsigned int   tid )
    730725{
    731726    unsigned int    i , j;
     
    734729    unsigned int    index;
    735730
    736     for ( j = 0 ; j < rootN ; j++ )      // loop on row index
     731    // compute row_min and row_max
     732    unsigned int    row_min = tid * rows_per_thread;
     733    unsigned int    row_max = row_min + rows_per_thread;
     734
     735    for ( j = row_min ; j < row_max ; j++ )      // loop on rows
    737736    { 
    738         for ( i = 0 ; i < rootN ; i++ )  // loop on point in a row
     737        for ( i = 0 ; i < rootN ; i++ )          // loop on points in a row
    739738        { 
    740739            index     = j * rootN + i;
     
    745744            if ( mode == RANDOM )               
    746745            {
    747                 x[c_id][2*c_offset]   = ( (double)rand() ) / 65536;
    748                 x[c_id][2*c_offset+1] = ( (double)rand() ) / 65536;
     746                data[c_id][2*c_offset]   = ( (double)rand() ) / 65536;
     747                data[c_id][2*c_offset+1] = ( (double)rand() ) / 65536;
    749748            }
    750749           
     
    754753            {
    755754                double phi = (double)( 2 * PI * index) / N;
    756                 x[c_id][2*c_offset]   = cos( phi );
    757                 x[c_id][2*c_offset+1] = sin( phi );
     755                data[c_id][2*c_offset]   = cos( phi );
     756                data[c_id][2*c_offset+1] = sin( phi );
    758757            }
    759758
     
    761760            if ( mode == CONSTANT )               
    762761            {
    763                 x[c_id][2*c_offset]   = 1.0;
    764                 x[c_id][2*c_offset+1] = 0.0;
     762                data[c_id][2*c_offset]   = 1.0;
     763                data[c_id][2*c_offset+1] = 0.0;
    765764            }
    766765        }
     
    768767}
    769768
    770 /////////////////////////
    771 void InitU( double ** u )
    772 {
    773     unsigned int    q;
    774     unsigned int    j;
    775     unsigned int    base;
    776     unsigned int    n1;
    777     unsigned int    c_id;
    778     unsigned int    c_offset;
    779     double  phi;
    780     unsigned int    stop = 0;
    781 
    782     for (q = 0 ; ((unsigned int)(1 << q) < N) && (stop == 0) ; q++)
    783     { 
    784         n1 = 1 << q;
    785         base = n1 - 1;
    786         for (j = 0; (j < n1) && (stop == 0) ; j++)
    787         {
    788             if (base + j > rootN - 1) return;
    789 
    790             c_id      = (base + j) / (rootN / nclusters);
    791             c_offset  = (base + j) % (rootN / nclusters);
    792             phi = (double)(2.0 * PI * j) / (2 * n1);
    793             u[c_id][2*c_offset]   = cos( phi );
    794             u[c_id][2*c_offset+1] = -sin( phi );
    795         }
    796     }
    797 }
    798 
    799 //////////////////////////
    800 void InitT( double ** u )
     769///////////////////////////////////////////////////////////////////////////////////////
     770// Each working thread <tid> initializes (rootN / nthreads) rows
     771// of the shared - and distributed - <twiddle> array.
     772///////////////////////////////////////////////////////////////////////////////////////
     773void InitT( double      ** twid,
     774            unsigned int   tid )
    801775{
    802776    unsigned int    i, j;
     
    806780    double  phi;
    807781
    808     for ( j = 0 ; j < rootN ; j++ )      // loop on row index
     782    // compute row_min and row_max
     783    unsigned int    row_min = tid * rows_per_thread;
     784    unsigned int    row_max = row_min + rows_per_thread;
     785
     786    for ( j = row_min ; j < row_max ; j++ )      // loop on rows
    809787    { 
    810         for ( i = 0 ; i < rootN ; i++ )  // loop on points in a row
     788        for ( i = 0 ; i < rootN ; i++ )          // loop on points in a row
    811789        { 
    812790            index     = j * rootN + i;
     
    815793
    816794            phi = (double)(2.0 * PI * i * j) / N;
    817             u[c_id][2*c_offset]   = cos( phi );
    818             u[c_id][2*c_offset+1] = -sin( phi );
     795            twid[c_id][2*c_offset]   = cos( phi );
     796            twid[c_id][2*c_offset+1] = -sin( phi );
     797        }
     798    }
     799}
     800
     801///////////////////////////////////////////////////////////////////////////////////////
     802// Each working thread initializes its private <upriv> array / (rootN - 1) entries.
     803///////////////////////////////////////////////////////////////////////////////////////
     804void InitU( double * upriv )
     805{
     806    unsigned int    q;
     807    unsigned int    j;
     808    unsigned int    base;
     809    unsigned int    n1;
     810    double  phi;
     811
     812    for (q = 0 ; ((unsigned int)(1 << q) < N) ; q++)
     813    { 
     814        n1 = 1 << q;    // n1 == 2**q
     815        base = n1 - 1;
     816        for (j = 0; (j < n1) ; j++)
     817        {
     818            if (base + j > rootN - 1) return;
     819
     820            phi = (double)(2.0 * PI * j) / (2 * n1);
     821            upriv[2*(base+j)]   = cos( phi );
     822            upriv[2*(base+j)+1] = -sin( phi );
    819823        }
    820824    }
     
    856860            double        *  upriv,           // local array containing coefs for rootN FFT
    857861            double       **  twid,            // distributed arrays containing N twiddle factors
    858             unsigned int     MyNum,           // thread continuous index
     862            unsigned int     tid,             // thread continuous index
    859863            unsigned int     MyFirst,
    860864            unsigned int     MyLast )
     
    868872get_cycle( &cycle );
    869873printf("\n[fft] %s : thread %d enter / first %d / last %d / cycle %d\n",
    870 __FUNCTION__, MyNum, MyFirst, MyLast, (unsigned int)cycle );
     874__FUNCTION__, tid, MyFirst, MyLast, (unsigned int)cycle );
    871875#endif
    872876
     
    877881get_cycle( &cycle );
    878882printf("\n[fft] %s : thread %d after first transpose / cycle %d\n",
    879 __FUNCTION__, MyNum, (unsigned int)cycle );
     883__FUNCTION__, tid, (unsigned int)cycle );
    880884if( PRINT_ARRAY ) PrintArray( tmp , N );
    881885#endif
     
    885889    pthread_barrier_wait( &barrier );
    886890    get_cycle( &barrier_stop );
    887     sync_time[MyNum] = (unsigned int)(barrier_stop - barrier_start);
     891    sync_time[tid] = (unsigned int)(barrier_stop - barrier_start);
    888892
    889893#if( DEBUG_FFT1D & 1 )
    890894get_cycle( &cycle );
    891895printf("\n[fft] %s : thread %d exit barrier after first transpose / cycle %d\n",
    892 __FUNCTION__, MyNum, (unsigned int)cycle );
     896__FUNCTION__, tid, (unsigned int)cycle );
    893897#endif
    894898
     
    902906
    903907#if( DEBUG_FFT1D & 1 )
    904 printf("\n[fft] %s : thread %d after first twiddle\n", __FUNCTION__, MyNum);
     908printf("\n[fft] %s : thread %d after first twiddle\n", __FUNCTION__, tid);
    905909if( PRINT_ARRAY ) PrintArray( tmp , N );
    906910#endif
     
    912916
    913917#if( DEBUG_FFT1D & 1 )
    914 printf("\n[fft] %s : thread %d exit barrier after first twiddle\n", __FUNCTION__, MyNum);
    915 #endif
    916 
    917     sync_time[MyNum] += (unsigned int)(barrier_stop - barrier_start);
     918printf("\n[fft] %s : thread %d exit barrier after first twiddle\n", __FUNCTION__, tid);
     919#endif
     920
     921    sync_time[tid] += (unsigned int)(barrier_stop - barrier_start);
    918922
    919923    // transpose tmp to x
     
    921925
    922926#if( DEBUG_FFT1D & 1 )
    923 printf("\n[fft] %s : thread %d after second transpose\n", __FUNCTION__, MyNum);
     927printf("\n[fft] %s : thread %d after second transpose\n", __FUNCTION__, tid);
    924928if( PRINT_ARRAY ) PrintArray( x , N );
    925929#endif
     
    931935
    932936#if( DEBUG_FFT1D & 1 )
    933 printf("\n[fft] %s : thread %d exit barrier after second transpose\n", __FUNCTION__, MyNum);
    934 #endif
    935 
    936     sync_time[MyNum] += (unsigned int)(barrier_stop - barrier_start);
     937printf("\n[fft] %s : thread %d exit barrier after second transpose\n", __FUNCTION__, tid);
     938#endif
     939
     940    sync_time[tid] += (unsigned int)(barrier_stop - barrier_start);
    937941
    938942    // do FFTs on rows of x and apply the scaling factor
     
    944948
    945949#if( DEBUG_FFT1D & 1 )
    946 printf("\n[fft] %s : thread %d after FFT on rows\n", __FUNCTION__, MyNum);
     950printf("\n[fft] %s : thread %d after FFT on rows\n", __FUNCTION__, tid);
    947951if( PRINT_ARRAY ) PrintArray( x , N );
    948952#endif
     
    954958
    955959#if( DEBUG_FFT1D & 1 )
    956 printf("\n[fft] %s : thread %d exit barrier after FFT on rows\n", __FUNCTION__, MyNum);
    957 #endif
    958     sync_time[MyNum] += (unsigned int)(barrier_stop - barrier_start);
     960printf("\n[fft] %s : thread %d exit barrier after FFT on rows\n", __FUNCTION__, tid);
     961#endif
     962    sync_time[tid] += (unsigned int)(barrier_stop - barrier_start);
    959963
    960964    // transpose x to tmp
     
    962966
    963967#if( DEBUG_FFT1D & 1 )
    964 printf("\n[fft] %s : thread %x after third transpose\n", __FUNCTION__, MyNum);
     968printf("\n[fft] %s : thread %x after third transpose\n", __FUNCTION__, tid);
    965969if( PRINT_ARRAY ) PrintArray( x , N );
    966970#endif
     
    972976
    973977#if( DEBUG_FFT1D & 1 )
    974 printf("\n[fft] %s : thread %d exit barrier after third transpose\n", __FUNCTION__, MyNum);
    975 #endif
    976 
    977     sync_time[MyNum] += (unsigned int)(barrier_stop - barrier_start);
    978     sync_time[MyNum] += (long)(barrier_stop - barrier_start);
     978printf("\n[fft] %s : thread %d exit barrier after third transpose\n", __FUNCTION__, tid);
     979#endif
     980
     981    sync_time[tid] += (unsigned int)(barrier_stop - barrier_start);
     982    sync_time[tid] += (long)(barrier_stop - barrier_start);
    979983
    980984    // copy tmp to x
     
    982986
    983987#if DEBUG_FFT1D
    984 printf("\n[fft] %s : thread %d completed\n", __FUNCTION__, MyNum);
     988printf("\n[fft] %s : thread %d completed\n", __FUNCTION__, tid);
    985989if( PRINT_ARRAY ) PrintArray( x , N );
    986990#endif
Note: See TracChangeset for help on using the changeset viewer.