Changeset 652 for trunk/user/fft/fft.c


Ignore:
Timestamp:
Nov 14, 2019, 3:56:51 PM (3 years ago)
Author:
alain
Message:

Introduce the three placement modes in "transpose", "convol", "fft" applications.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/user/fft/fft.c

    r649 r652  
    1515/*************************************************************************/
    1616
    17 ///////////////////////////////////////////////////////////////////////////
     17////////////////////////////////////////////////////////////////////////////////////////
    1818// This port of the SPLASH FFT benchmark on the ALMOS-MKH OS has been
    1919// done by Alain Greiner (august 2018).
     
    4545// that contains all coefs required for a rootN points FFT.
    4646//
    47 // There is one working thread per core.
    4847// The actual number of cores and cluster in a given hardware architecture
    4948// is obtained by the get_config() syscall (x_size, y_size, ncores).
     
    5150// The max number of cores per cluster is bounded by CORES_MAX.
    5251//
    53 // Several configuration parameters can be defined below:
    54 //  - PRINT_ARRAY : Print out complex data points arrays.
    55 //  - CHECK       : Perform both FFT and inverse FFT to check output/input.
    56 //  - DEBUG_MAIN  : Display intermediate results in main()
    57 //  - DEBUG_FFT1D : Display intermediate results in FFT1D()
    58 //  - DEBUG_ROW   : Display intermedite results in FFTrow()
     52// The number N of working threads is always defined by the number of cores available
     53// in the architecture, but this application supports three placement modes.
     54// In all modes, the working threads are identified by the [tid] continuous index
     55// in range [0, NTHREADS-1], and defines how the lines are shared amongst the threads.
     56// This continuous index can always be decomposed in two continuous sub-indexes:
     57// tid == cid * ncores + lid,  where cid is in [0,NCLUSTERS-1] and lid in [0,NCORES-1].
     58//
     59// - NO_PLACEMENT: the main thread is itself a working thread. The (N-1) other working
     60//   threads are created by the main thread, but the placement is done by the OS, using
     61//   the DQDT for load balancing, and two working threads can be placed on the same core.
     62//   The [cid,lid] are only abstract identifiers, and cannot be associated with a physical
     63//   cluster or a physical core. In this mode, the main thread runs on any cluster,
     64//   but has tid = 0 (i.e. cid = 0 & lid = 0).
     65//
     66// - EXPLICIT_PLACEMENT: the main thread is again a working thread, but the placement
     67//   of the threads on the cores is explicitly controlled by the main thread to have
     68//   exactly one working thread per core, and the [cxy][lpid] core coordinates for a given
     69//   thread[tid] can be directly derived from the [tid] value: [cid] is an alias for the
     70//   physical cluster identifier, and [lid] is the local core index.
     71//
     72// - PARALLEL_PLACEMENT: the main thread is no longer a working thread, and uses the
     73//   non-standard pthread_parallel_create() function to avoid the costly sequential
     74//   loops for pthread_create() and pthread_join(). It guarantees one working thread
     75//   per core, and the same relation between the thread[tid] and the core[cxy][lpid].
     76//
     77// Several other configuration parameters can be defined below:
     78//  - USE_DQT_BARRIER : use a hierarchical barrier for working threads synchro
     79//  - PRINT_ARRAY     : Print out complex data points arrays.
     80//  - CHECK           : Perform both FFT and inverse FFT to check output/input.
     81//  - DEBUG_MAIN      : Display intermediate results in main()
     82//  - DEBUG_FFT1D     : Display intermediate results in FFT1D()
     83//  - DEBUG_ROW       : Display intermediate results in FFTrow()
    5984//
    6085// Regarding final instrumentation:
     
    6691//   is computed by each thread(i) in the work() function.
    6792// The results are displayed on the TXT terminal, and registered on disk.
    68 ///////////////////////////////////////////////////////////////////////////
     93///////////////////////////////////////////////////////////////////////////////////////
    6994
    7095#include <math.h>
     
    92117// parameters
    93118
     119#define NO_PLACEMENT            1
     120#define EXPLICIT_PLACEMENT      0
     121#define PARALLEL_PLACEMENT      0
     122
    94123#define DEFAULT_M               18              // 256 K complex points
    95124#define USE_DQT_BARRIER         1               // use DQT barrier if non zero
     
    110139/////////////////////////////////////////////////////////////////////////////////////
    111140
    112 // work function arguments
    113 typedef struct work_args_s
    114 {
    115     unsigned int        tid;               // thread continuous index
    116     unsigned int        lid;               // core local index
    117     unsigned int        cid;               // cluster continuous index
    118     pthread_barrier_t * parent_barrier;    // parent barrier to signal completion
    119 }
    120 work_args_t;
     141unsigned int   x_size;                     // platform global parameter
     142unsigned int   y_size;                     // platform global parameter
     143unsigned int   ncores;                     // platform global parameter
    121144
    122145unsigned int   nthreads;                   // total number of threads (one thread per core)
     
    130153// arrays of pointers on distributed buffers (one sub-buffer per cluster)
    131154double *       data[CLUSTERS_MAX];         // original time-domain data
    132 double *       trans[CLUSTERS_MAX];        // used as auxiliary space for transpose
     155double *       trans[CLUSTERS_MAX];        // used as auxiliary space for fft
    133156double *       twid[CLUSTERS_MAX];         // twiddle factor : exp(-2iPI*k*n/N)
    134157double *       bloup[CLUSTERS_MAX];        // used as auxiliary space for DFT
     
    146169pthread_barrierattr_t  barrier_attr;
    147170
    148 /////////////////////////////////////////////////////////////////////////////////////
    149 //             Global variables required by parallel_pthread_create()
    150 /////////////////////////////////////////////////////////////////////////////////////
    151 
    152 // 2D arrays of input arguments for the <work> threads
    153 // These arrays are initialised by the application main thread
    154 
    155 work_args_t       work_args[CLUSTERS_MAX][CORES_MAX];  // work function arguments
    156 work_args_t     * work_ptrs[CLUSTERS_MAX][CORES_MAX];  // pointers on arguments
    157 
    158 // 1D array of barriers to allow the <work> threads to signal termination
    159 // this array is initialised in each cluster by the <build[cxy][0]> thread
    160  
    161 pthread_barrier_t parent_barriers[CLUSTERS_MAX];        // termination barrier
     171//return values at thread exit
     172unsigned int   THREAD_EXIT_SUCCESS = 0;
     173unsigned int   THREAD_EXIT_FAILURE = 1;
     174
     175// main thread continuous index
     176unsigned int     tid_main;
     177
     178// array of kernel thread identifiers / indexed by [tid]
     179pthread_t      work_trdid[CLUSTERS_MAX * CORES_MAX];   
     180
     181// array of thread attributes / indexed by [tid]
     182pthread_attr_t work_attr[CLUSTERS_MAX * CORES_MAX];
     183
     184// array of work function arguments / indexed by [tid]
     185pthread_parallel_work_args_t work_args[CLUSTERS_MAX * CORES_MAX];
    162186
    163187/////////////////////////////////////////////////////////////////////////////////////
     
    165189/////////////////////////////////////////////////////////////////////////////////////
    166190
    167 void work( work_args_t * args );
     191void work( pthread_parallel_work_args_t * args );
    168192
    169193double CheckSum( void );
     
    234258    int                 error;
    235259
    236     unsigned int        x_size;            // number of clusters per row
    237     unsigned int        y_size;            // number of clusters per column
    238     unsigned int        ncores;            // max number of cores per cluster
    239 
    240 
    241     unsigned int        x;                 // current index for cluster X coordinate
    242     unsigned int        y;                 // current index for cluster Y coordinate
    243     unsigned int        lid;               // current index for core in a cluster
    244260    unsigned int        tid;               // continuous thread index
    245     unsigned int        cid;               // cluster continuous index
    246     unsigned int        cxy;               // hardware specific cluster identifier
    247261
    248262    char                name[64];          // instrumentation file name
     
    265279    int                 pid = getpid();
    266280
     281    // check placement mode
     282    if( (NO_PLACEMENT + EXPLICIT_PLACEMENT + PARALLEL_PLACEMENT) != 1 )
     283    {
     284        printf("\n[fft error] illegal placement mode\n");
     285        exit( 0 );
     286    }
     287
    267288    // get FFT application start cycle
    268289    get_cycle( &start_init_cycle );
     
    295316        exit( 0 );
    296317    }
     318
     319    // get identifiers for core executing main
     320    unsigned int  cxy_main;
     321    unsigned int  lid_main;
     322    get_core_id( &cxy_main , &lid_main );
    297323
    298324    // compute nthreads and nclusters
     
    317343    }
    318344
    319     printf("\n[fft] starts / %d points / %d thread(s) / PID %x / cycle %d\n",
    320     N, nthreads, pid, (unsigned int)start_init_cycle );
    321 
    322     // build instrumentation file name
    323     if( USE_DQT_BARRIER )
    324     snprintf( name , 64 , "p_fft_dqt_%d_%d_%d", N , x_size * y_size , ncores );
    325     else
    326     snprintf( name , 64 , "p_fft_smp_%d_%d_%d", N , x_size * y_size , ncores );
    327 
    328     // build pathname
     345    // define instrumentation file name
     346    if( NO_PLACEMENT )
     347    {
     348        printf("\n[fft] starts / %d points / %d thread(s) / PID %x / NO_PLACE\n",
     349        N, nthreads, pid );
     350
     351        // build instrumentation file name
     352        if( USE_DQT_BARRIER )
     353        snprintf( name , 64 , "fft_dqt_no_place_%d_%d_%d", M , x_size * y_size , ncores );
     354        else
     355        snprintf( name , 64 , "fft_smp_no_place_%d_%d_%d", M , x_size * y_size , ncores );
     356    }
     357
     358    if( EXPLICIT_PLACEMENT )
     359    {
     360        printf("\n[fft] starts / %d points / %d thread(s) / PID %x / EXPLICIT\n",
     361        N, nthreads, pid );
     362
     363        // build instrumentation file name
     364        if( USE_DQT_BARRIER )
     365        snprintf( name , 64 , "fft_dqt_explicit_%d_%d_%d", M , x_size * y_size , ncores );
     366        else
     367        snprintf( name , 64 , "fft_smp_explicit_%d_%d_%d", M , x_size * y_size , ncores );
     368    }
     369
     370    if( PARALLEL_PLACEMENT )
     371    {
     372        printf("\n[fft] starts / %d points / %d thread(s) / PID %x / PARALLEL\n",
     373        N, nthreads, pid );
     374
     375        // build instrumentation file name
     376        if( USE_DQT_BARRIER )
     377        snprintf( name , 64 , "fft_dqt_parallel_%d_%d_%d", M , x_size * y_size , ncores );
     378        else
     379        snprintf( name , 64 , "fft_smp_parallel_%d_%d_%d", M , x_size * y_size , ncores );
     380    }
     381
     382    // build instrumentation file pathname
    329383    snprintf( path , 128 , "/home/%s", name );
    330384
     
    339393#if DEBUG_MAIN
    340394get_cycle( &debug_cycle );
    341 printf("\n[fft] main open file <%s> at cycle %d\n",
     395printf("\n[fft] main open instrumentation file <%s> at cycle %d\n",
    342396path, (unsigned int)debug_cycle );
    343397#endif
     
    381435#if DEBUG_MAIN
    382436get_cycle( &debug_cycle );
    383 printf("\n[fft] main completes barrier init at cycle %d\n",
     437printf("\n[fft] main completes sequencial initialisation at cycle %d\n",
    384438(unsigned int)debug_cycle );
    385439#endif
    386 
    387     // build array of arguments for the <work> threads
    388     for (x = 0 ; x < x_size ; x++)
    389     {
    390         for (y = 0 ; y < y_size ; y++)
    391         {
    392             // compute cluster identifier
    393             cxy = HAL_CXY_FROM_XY( x , y );
    394 
    395             for ( lid = 0 ; lid < ncores ; lid++ )
    396             {
    397                 // compute cluster continuous index
    398                 cid = (x * y_size) + y;
    399 
    400                 // compute work thread continuous index
    401                 tid = (cid * ncores) + lid;
    402                
    403                 // initialize 2D array of arguments
    404                 work_args[cxy][lid].tid            = tid;
    405                 work_args[cxy][lid].lid            = lid;
    406                 work_args[cxy][lid].cid            = cid;
    407                 work_args[cxy][lid].parent_barrier = &parent_barriers[cxy];
    408 
    409                 // initialize 2D array of pointers
    410                 work_ptrs[cxy][lid] = &work_args[cxy][lid];
    411             }
    412         }
    413     }
    414440
    415441    // register sequencial time
     
    417443    init_time = (unsigned int)(end_init_cycle - start_init_cycle);
    418444
     445    //////////////////
     446    if( NO_PLACEMENT )
     447    {
     448        // the tid value for the main thread is always 0
     449        // main thread creates new threads with tid in [1,nthreads-1] 
     450        unsigned int tid;
     451        for ( tid = 0 ; tid < nthreads ; tid++ )
     452        {
     453            // register tid value in work_args[tid] array
     454            work_args[tid].tid = tid;
     455           
     456            // create other threads
     457            if( tid > 0 )
     458            {
     459                if ( pthread_create( &work_trdid[tid],
     460                                     NULL,                  // no attribute
     461                                     &work,
     462                                     &work_args[tid] ) )
     463                {
     464                    printf("\n[fft error] cannot create thread %d\n", tid );
     465                    exit( 0 );
     466                }
     467
    419468#if DEBUG_MAIN
    420 printf("\n[fft] main completes <work> threads arguments at cycle %d\n",
    421 (unsigned int)end_init_cycle );
    422 #endif
    423 
    424     // create and execute the working threads
    425     if( pthread_parallel_create( root_level,
    426                                  &work,
    427                                  &work_ptrs[0][0],
    428                                  &parent_barriers[0] ) )
    429     {
    430         printf("\n[fft error] creating threads\n");
    431         exit( 0 );
     469printf("\n[fft] main created thread %d\n", tid );
     470#endif
     471
     472            }
     473            else
     474            {
     475                tid_main = 0;
     476            }
     477        }  // end for tid
     478
     479        // main thread itself calls the work() function
     480        work( &work_args[0] );
     481
     482        // main thread wait other threads completion
     483        for ( tid = 1 ; tid < nthreads ; tid++ )
     484        {
     485            unsigned int * status;
     486
     487            // main wait thread[tid] status
     488            if ( pthread_join( work_trdid[tid], (void*)(&status)) )
     489            {
     490                printf("\n[fft error] main cannot join thread %d\n", tid );
     491                exit( 0 );
     492            }
     493       
     494            // check status
     495            if( *status != THREAD_EXIT_SUCCESS )
     496            {
     497                printf("\n[fft error] thread %x returned failure\n", tid );
     498                exit( 0 );
     499            }
     500
     501#if DEBUG_MAIN
     502printf("\n[fft] main successfully joined thread %x\n", tid );
     503#endif
     504       
     505        }  // end for tid
     506
     507    }  // end if no_placement
     508
     509    ////////////////////////
     510    if( EXPLICIT_PLACEMENT )
     511    {
     512        // main thread places each thread[tid] on a specific core[cxy][lid]
     513        // but the actual thread creation is sequencial
     514        unsigned int x;
     515        unsigned int y;
     516        unsigned int l;
     517        unsigned int cxy;                   // cluster identifier
     518        unsigned int tid;                   // thread continuous index
     519
     520        for( x = 0 ; x < x_size ; x++ )
     521        {
     522            for( y = 0 ; y < y_size ; y++ )
     523            {
     524                cxy = HAL_CXY_FROM_XY( x , y );
     525                for( l = 0 ; l < ncores ; l++ )
     526                {
     527                    // compute thread continuous index
     528                    tid = (((x  * y_size) + y) * ncores) + l;
     529
     530                    // register tid value in work_args[tid] array
     531                    work_args[tid].tid = tid;
     532
     533                    // no thread created on the core running the main
     534                    if( (cxy != cxy_main) || (l != lid_main) )
     535                    {
     536                        // define thread attributes
     537                        work_attr[tid].attributes = PT_ATTR_CLUSTER_DEFINED |
     538                                                    PT_ATTR_CORE_DEFINED;
     539                        work_attr[tid].cxy        = cxy;
     540                        work_attr[tid].lid        = l;
     541 
     542                        // create thread[tid] on core[cxy][l]
     543                        if ( pthread_create( &work_trdid[tid],   
     544                                             &work_attr[tid],   
     545                                             &work,
     546                                             &work_args[tid] ) )       
     547                        {
     548                            printf("\n[fft error] cannot create thread %d\n", tid );
     549                            exit( 0 );
     550                        }
     551#if DEBUG_MAIN
     552printf("\n[fft] main created thread[%d] on core[%x,%d]\n", tid, cxy, l );
     553#endif
     554                    }
     555                    else
     556                    {
     557                        tid_main = tid;
     558                    }
     559                }
     560            }
     561        }
     562
     563        // main thread itself calls the work() function
     564        work( &work_args[tid_main] );
     565
     566        // main thread wait other threads completion
     567        for( tid = 0 ; tid < nthreads ; tid++ )
     568        {
     569            // no other thread on the core running the main
     570            if( tid != tid_main )
     571            {
     572                unsigned int * status;
     573
     574                // wait thread[tid]
     575                if( pthread_join( work_trdid[tid] , (void*)(&status) ) )
     576                {
     577                    printf("\n[fft error] main cannot join thread %d\n", tid );
     578                    exit( 0 );
     579                }
     580       
     581                // check status
     582                if( *status != THREAD_EXIT_SUCCESS )
     583                {
     584                    printf("\n[fft error] thread %d returned failure\n", tid );
     585                    exit( 0 );
     586                }
     587#if DEBUG_MAIN
     588printf("\n[fft] main joined thread %d on core[%x,%d]\n", tid , cxy , l );
     589#endif
     590            }
     591        }
     592    }  // end if explicit_placement
     593
     594    ////////////////////////
     595    if( PARALLEL_PLACEMENT )
     596    {
     597        // create and execute the working threads
     598        if( pthread_parallel_create( root_level , &work ) )
     599        {
     600            printf("\n[fft error] cannot create threads\n");
     601            exit( 0 );
     602        }
    432603    }
    433604
     
    533704// This function is executed in parallel by all <work> threads.
    534705/////////////////////////////////////////////////////////////////
    535 void work( work_args_t * args )
     706void work( pthread_parallel_work_args_t * args )
    536707{
    537708    unsigned int        tid;              // this thread continuous index
     
    549720    unsigned long long  barrier_stop;
    550721
     722    get_cycle( &parallel_start );
     723
    551724    // get thread arguments
    552725    tid            = args->tid;
    553     lid            = args->lid;             
    554     cid            = args->cid;             
    555     parent_barrier = args->parent_barrier;
    556 
    557     get_cycle( &parallel_start );
    558 
     726    parent_barrier = args->barrier;
     727
     728    // compute lid and cid from tid
     729    lid            = tid % ncores;             
     730    cid            = tid / ncores;
     731           
    559732#if DEBUG_WORK
    560733printf("\n[fft] %s : thread %d enter / cycle %d\n",
     
    602775printf("\n[fft] %s : thread %d exit barrier for buffer allocation / cycle %d\n",
    603776__FUNCTION__, tid, (unsigned int)barrier_stop );
    604 #endif
    605 
    606 #if DISPLAY_SCHED_AND_VMM
    607     unsigned int x_size;
    608     unsigned int y_size;
    609     unsigned int ncores;
    610     get_config( &x_size , &y_size , &ncores );
    611     unsigned int x   = cid / y_size;
    612     unsigned int y   = cid % y_size;
    613     unsigned int cxy = HAL_CXY_FROM_XY( x , y );
    614 display_sched( cxy , lid );
    615 if( lid == 0 ) display_vmm( cxy , getpid() , 0 );
    616777#endif
    617778
     
    9191080// contained in the distributed buffers x[nclusters][points_per_cluster].
    9201081// It handles the (N) points 1D array as a (rootN*rootN) points 2D array.
    921 // 1) it transpose (rootN/nthreads ) rows from x to tmp.
     1082// 1) it transposes (rootN/nthreads) rows from x to tmp.
    9221083// 2) it make (rootN/nthreads) FFT on the tmp rows and apply the twiddle factor.
    923 // 3) it transpose (rootN/nthreads) columns from tmp to x.
     1084// 3) it transposes (rootN/nthreads) columns from tmp to x.
    9241085// 4) it make (rootN/nthreads) FFT on the x rows.
    9251086// It calls the FFTRow() 2*(rootN/nthreads) times to perform the in place FFT
     
    9461107#endif
    9471108
    948     // transpose (rootN/nthreads) rows from x to tmp
     1109    // transpose (rootN/nthreads) rows from x to tmp
    9491110    Transpose( x , tmp , MyFirst , MyLast );
    9501111
    9511112#if( DEBUG_FFT1D & 1 )
    9521113get_cycle( &cycle );
    953 printf("\n[fft] %s : thread %d after first transpose / cycle %d\n",
     1114printf("\n[fft] %s : thread %d after first fft / cycle %d\n",
    9541115__FUNCTION__, tid, (unsigned int)cycle );
    9551116if( PRINT_ARRAY ) PrintArray( tmp , N );
     
    9641125#if( DEBUG_FFT1D & 1 )
    9651126get_cycle( &cycle );
    966 printf("\n[fft] %s : thread %d exit barrier after first transpose / cycle %d\n",
     1127printf("\n[fft] %s : thread %d exit barrier after first fft / cycle %d\n",
    9671128__FUNCTION__, tid, (unsigned int)cycle );
    9681129#endif
     
    9921153    sync_time[tid] += (unsigned int)(barrier_stop - barrier_start);
    9931154
    994     // transpose tmp to x
     1155    // transpose tmp to x
    9951156    Transpose( tmp , x , MyFirst , MyLast );
    9961157
    9971158#if( DEBUG_FFT1D & 1 )
    998 printf("\n[fft] %s : thread %d after second transpose\n", __FUNCTION__, tid);
     1159printf("\n[fft] %s : thread %d after second fft\n", __FUNCTION__, tid);
    9991160if( PRINT_ARRAY ) PrintArray( x , N );
    10001161#endif
     
    10061167
    10071168#if( DEBUG_FFT1D & 1 )
    1008 printf("\n[fft] %s : thread %d exit barrier after second transpose\n", __FUNCTION__, tid);
     1169printf("\n[fft] %s : thread %d exit barrier after second fft\n", __FUNCTION__, tid);
    10091170#endif
    10101171
     
    10331194    sync_time[tid] += (unsigned int)(barrier_stop - barrier_start);
    10341195
    1035     // transpose x to tmp
     1196    // transpose x to tmp
    10361197    Transpose( x , tmp , MyFirst , MyLast );
    10371198
    10381199#if( DEBUG_FFT1D & 1 )
    1039 printf("\n[fft] %s : thread %x after third transpose\n", __FUNCTION__, tid);
     1200printf("\n[fft] %s : thread %x after third fft\n", __FUNCTION__, tid);
    10401201if( PRINT_ARRAY ) PrintArray( x , N );
    10411202#endif
     
    10471208
    10481209#if( DEBUG_FFT1D & 1 )
    1049 printf("\n[fft] %s : thread %d exit barrier after third transpose\n", __FUNCTION__, tid);
     1210printf("\n[fft] %s : thread %d exit barrier after third fft\n", __FUNCTION__, tid);
    10501211#endif
    10511212
Note: See TracChangeset for help on using the changeset viewer.