Changeset 637 for trunk/libs


Ignore:
Timestamp:
Jul 18, 2019, 2:06:55 PM (5 years ago)
Author:
alain
Message:

Introduce the non-standard pthread_parallel_create() system call
and re-write the <fft> and <sort> applications to improve the
intrinsic parallelism in applications.

Location:
trunk/libs
Files:
6 edited

Legend:

Unmodified
Added
Removed
  • trunk/libs/libalmosmkh/almosmkh.c

    r626 r637  
    22 * almosmkh.c - User level ALMOS-MKH specific library implementation.
    33 *
    4  * Author     Alain Greiner (2016,2017,2018)
     4 * Author     Alain Greiner (2016,2017,2018,2019)
    55 *
    66 * Copyright (c) UPMC Sorbonne Universites
     
    2424#include <almosmkh.h>
    2525#include <hal_user.h>
     26#include <hal_macros.h>
    2627#include <hal_shared_types.h>
    2728#include <syscalls_numbers.h>
     
    3233#include <mman.h>
    3334
    34 #define  MALLOC_DEBUG    0
     35#define  DEBUG_REMOTE_MALLOC     0
     36#define  DEBUG_PTHREAD_PARALLEL  1
    3537 
    36 /////////////     Non standard system calls    /////////////////////////////////
     38//////////////////////////////////////////////////////////////////////////////////////
     39/////////////     Non standard system calls    ///////////////////////////////////////
     40//////////////////////////////////////////////////////////////////////////////////////
    3741
    3842//////////////////////////
     
    6367}
    6468
    65 /////////////////////////////////
    66 int get_core( unsigned int * cxy,
    67               unsigned int * lid )
    68 {
    69     return hal_user_syscall( SYS_GET_CORE,
     69////////////////////////////////////
     70int get_core_id( unsigned int * cxy,
     71                 unsigned int * lid )
     72{
     73    return hal_user_syscall( SYS_GET_CORE_ID,
    7074                             (reg_t)cxy,
    7175                             (reg_t)lid, 0, 0 );
     76}
     77
     78/////////////////////////////////////
     79int get_nb_cores( unsigned int   cxy,
     80                  unsigned int * ncores )
     81{
     82    return hal_user_syscall( SYS_GET_NB_CORES,
     83                             (reg_t)cxy,
     84                             (reg_t)ncores, 0, 0 );
     85}
     86
     87///////////////////////////////////////////
     88int get_best_core( unsigned int   base_cxy,
     89                   unsigned int   level,
     90                   unsigned int * cxy,
     91                   unsigned int * lid )
     92{
     93    return hal_user_syscall( SYS_GET_BEST_CORE,
     94                             (reg_t)base_cxy,
     95                             (reg_t)level,
     96                             (reg_t)cxy,
     97                             (reg_t)lid );
    7298}
    7399
     
    250276}  // end get_string()
    251277
    252 
    253 ///////////////    non standard debug functions    //////////////////////////
     278//////////////////////////////////////////////////////////////////////////////////////
     279///////////////    non standard debug functions    ///////////////////////////////////
     280//////////////////////////////////////////////////////////////////////////////////////
    254281
    255282////////////////////////////////////
     
    496523
    497524
    498 ///////////////    non standard malloc functions    //////////////////////////
     525/////////////////////////////////////////////////////////////////////////////////////////
     526///////////////    non standard remote_malloc    ////////////////////////////////////////
     527/////////////////////////////////////////////////////////////////////////////////////////
    499528
    500529/////////////////////////////////////////////////////////////////////////////////////////
    501530// Global variable defining the allocator array (one per cluster)
    502531// This array (about 16 Kbytes ) will be stored in the data segment
    503 // of any application linked with this malloc libray.
     532// of any application linked with this library.
    504533/////////////////////////////////////////////////////////////////////////////////////////
    505534
     
    546575////////////////////////////////////////////////////////////////////////////////////////////
    547576
    548 #if MALLOC_DEBUG
     577#if DEBUG_REMOTE_MALLOC
    549578static void display_free_array( unsigned int cxy )
    550579{
     
    594623    unsigned int   iter;             // iterator
    595624
    596 #if MALLOC_DEBUG
    597 printf("\n[MALLOC] %s : enter for store[%x] / size = %x\n",
    598 __FUNCTION__, cxy, store_size );
     625#if DEBUG_REMOTE_MALLOC
     626unsigned int core_cxy;
     627unsigned int core_lid;
     628get_core_id( &core_cxy , &core_lid );
     629printf("\n[%s] core[%x,%d] enter for store[%x] / size = %x\n",
     630__FUNCTION__, core_cxy, core_lid, cxy, store_size );
    599631#endif
    600632
     
    635667    }
    636668
    637 #if MALLOC_DEBUG
    638 printf("\n[MALLOC] %s : mmap done for store[%x] / base = %x\n",
    639 __FUNCTION__, cxy, store_base );
     669#if DEBUG_REMOTE_MALLOC
     670printf("\n[%s] core[%x,%d] created vseg %x for store[%x]\n",
     671__FUNCTION__, core_cxy, core_lid, store_base, cxy );
    640672#endif
    641673
     
    656688    }
    657689
    658     // DEPRECATED: we don't reset the alloc_base array
    659     // because we don't want to allocate the physical memory
    660     // when the heap is created  [AG]
    661     // memset( (void *)alloc_base , 0 , alloc_size );
    662  
    663690    // split the store into various sizes blocks,
    664691    // initializes the free[] array and NEXT pointers
     
    690717
    691718
    692 #if MALLOC_DEBUG
    693 printf("\n[MALLOC] %s : completes store[%x] initialisation\n",
    694 __FUNCTION__, cxy );
    695 
     719#if DEBUG_REMOTE_MALLOC
     720printf("\n[%s] core[%x,%d] completed store[%x] initialisation\n",
     721__FUNCTION__, core_cxy, core_lid, cxy );
     722#endif
     723
     724#if (DEBUG_REMOTE_MALLOC & 1)
    696725display_free_array( cxy );
    697726#endif
     
    762791    int error;
    763792
    764 #if MALLOC_DEBUG
    765 printf("\n[MALLOC] %s : enter for size = %x / cxy = %x\n",
    766 __FUNCTION__ , size , cxy );
     793#if DEBUG_REMOTE_MALLOC
     794unsigned int core_cxy;
     795unsigned int core_lid;
     796get_core_id( &core_cxy , &core_lid );
     797printf("\n[%s] core[%x,%d] enter for size = %x / target_cxy = %x\n",
     798__FUNCTION__ , core_cxy, core_lid, size , cxy );
    767799#endif
    768800
     
    828860    unsigned char * ptr    = (unsigned char*)(store[cxy].alloc_base + offset);
    829861
    830     // DEPRECATED : we cannot check the alloc[] array,
    831     // because it has not been initialised by store_init,
    832     // to avoid physical memory allocation at heap creation [AG]
    833     // if ( *ptr != 0 )
    834     // {
    835     //    pthread_mutex_unlock( &store[cxy].mutex );
    836     //    printf("\n[PANIC] in %s : allocate an already allocated block...\n",
    837     //    __FUNCTION__ );
    838     //    return NULL;
    839     // }
    840 
    841862    // update alloc_array
    842863    *ptr = requested_index;
     
    845866    pthread_mutex_unlock( &store[cxy].mutex );
    846867 
    847 #if MALLOC_DEBUG
    848 printf("\n[MALLOC] %s : exit / base = %x / size = %x / from store[%x]\n",
    849 __FUNCTION__, base , size , cxy );
     868#if DEBUG_REMOTE_MALLOC
     869printf("\n[%s] core[%x,%d] exit / base = %x / size = %x / from store[%x]\n",
     870__FUNCTION__, core_cxy, core_lid, base , size , cxy );
    850871#endif
    851872
     
    853874
    854875} // end remote_malloc()
    855 
    856 
    857876
    858877//////////////////////////////////////////
     
    920939
    921940    return new_ptr;
    922 }
     941
     942}  // end remote_realloc()
     943
    923944
    924945//////////////////////////////////////////////////////
     
    9911012{
    9921013
    993 #if MALLOC_DEBUG
     1014#if DEBUG_REMOTE_MALLOC
    9941015printf("\n[MALLOC] %s : enter for block = %x / cxy = %x\n",
    9951016__FUNCTION__, ptr, cxy );
     
    10521073    pthread_mutex_unlock( &store[cxy].mutex );
    10531074
    1054 #if MALLOC_DEBUG
     1075#if DEBUG_REMOTE_MALLOC
    10551076printf("\n[MALLOC] %s : conmpletes for block = %x / cxy = %x\n",
    10561077__FUNCTION__, ptr, cxy );
     
    10581079
    10591080} // end remote_free()
     1081
     1082/////////////////////////////////////////////////////////////////////////////////////////
     1083///////////////    non standard pthread_parallel_create    //////////////////////////////
     1084/////////////////////////////////////////////////////////////////////////////////////////
     1085
     1086#define X_MAX                   16              // max number of clusters in a row
     1087#define Y_MAX                   16              // max number of clusters in a column
     1088#define CLUSTERS_MAX            X_MAX * Y_MAX
     1089#define LEVEL_MAX               5
     1090#define CORES_MAX               4               // max number of cores per cluster
     1091
     1092typedef struct build_args_s           
     1093{
     1094    unsigned char       cxy;                    // this thread cluster identifier
     1095    unsigned char       level;                  // this thread level in quad-tree
     1096    unsigned char       parent_cxy;             // parent thread cluster identifier
     1097    unsigned char       root_level;             // quad-tree root level
     1098    void              * work_func;              // pointer on work function pointer
     1099    void              * work_args_array;        // pointer on 2D array of pointers
     1100    pthread_barrier_t * parent_barriers_array;  // pointer on 1D array of barriers
     1101    unsigned int        error;                  // return value : 0 if success
     1102}
     1103build_args_t;
     1104
     1105/////////////////////////////////////////////////////////////////////////////////////////
     1106//      Global variables used for inter-thread communications
     1107/////////////////////////////////////////////////////////////////////////////////////////
     1108
     1109pthread_attr_t    build_attr   [CLUSTERS_MAX][LEVEL_MAX];   // POSIX thread attributes
     1110
     1111build_args_t      build_args   [CLUSTERS_MAX][LEVEL_MAX];   // build function arguments
     1112
     1113pthread_barrier_t build_barrier[CLUSTERS_MAX][LEVEL_MAX];   // parent/child synchro
     1114
     1115pthread_attr_t    work_attr    [CLUSTERS_MAX][CORES_MAX];    // POSIX thread attributes
     1116
     1117//////////////////////////////////////////////////////////
     1118static void pthread_recursive_build( build_args_t * args )
     1119{
     1120    unsigned int   trdid;         // unused (required by pthread_create()
     1121
     1122    // get arguments
     1123    unsigned int        cxy                   = args->cxy;
     1124    unsigned int        level                 = args->level;
     1125    unsigned int        parent_cxy            = args->parent_cxy;
     1126    unsigned int        root_level            = args->root_level;
     1127    void              * work_func             = args->work_func;
     1128    void              * work_args_array       = args->work_args_array;
     1129    pthread_barrier_t * parent_barriers_array = args->parent_barriers_array;
     1130
     1131    // set error default value
     1132    build_args[cxy][level].error = 0;
     1133
     1134    ///////////////////////////////////////////////////////////
     1135    if( level == 0 )             // children are "work" threads
     1136    {
     1137        unsigned int   lid;           // core local index
     1138        unsigned int   ncores;        // number of cores in a cluster
     1139
     1140        // get number of cores per cluster
     1141        get_nb_cores( cxy , &ncores );
     1142
     1143        // kill process if no active core in cluster
     1144        // TODO this "if" should be replaced by an "assert" [AG]
     1145        if( ncores == 0 )
     1146        {
     1147            printf("\n[PANIC] in %s : no active core in cluster %x\n",
     1148            __FUNCTION__ , cxy );
     1149
     1150            // report error to parent
     1151            build_args[parent_cxy][level+1].error = 1;
     1152
     1153            // kill process
     1154            exit( EXIT_FAILURE );
     1155        }
     1156
     1157        // initialize the parent_barrier
     1158        if( pthread_barrier_init( &parent_barriers_array[cxy] , NULL , ncores + 1 ) )
     1159        {
     1160            printf("\n[ERROR] in %s : cannot initialise barrier for build thread[%x][%d]\n",
     1161            __FUNCTION__ , cxy , level );
     1162
     1163            // report error to parent
     1164            build_args[parent_cxy][level+1].error = 1;
     1165        }
     1166
     1167#if DEBUG_PTHREAD_PARALLEL
     1168printf("\n[%s] <build> thread[%x][%d] created barrier / %d children\n",
     1169__FUNCTION__, cxy, level, ncores + 1 );
     1170#endif
     1171        // create (ncores) "work" threads
     1172        for ( lid = 0 ; lid < ncores ; lid++ )
     1173        {
     1174            // set attributes for thread[cxy][lid]
     1175            work_attr[cxy][lid].attributes = PT_ATTR_DETACH |
     1176                                             PT_ATTR_CLUSTER_DEFINED |
     1177                                             PT_ATTR_CORE_DEFINED;
     1178            work_attr[cxy][lid].cxy        = cxy;
     1179            work_attr[cxy][lid].lid        = lid;
     1180
     1181            // compute pointer on thread[cxy][lid] arguments
     1182            void * work_args = *((void **)work_args_array + (cxy * CORES_MAX) + lid);
     1183
     1184            // create thread
     1185            if ( pthread_create( &trdid,                  // unused
     1186                                 &work_attr[cxy][lid],
     1187                                 work_func,
     1188                                 work_args ) )
     1189            {
     1190                printf("\n[ERROR] in %s : cannot create work thread[%x,%x]\n",
     1191                __FUNCTION__ , cxy , lid );
     1192
     1193                // report error to parent
     1194                build_args[parent_cxy][level+1].error = 1;
     1195            }
     1196
     1197#if DEBUG_PTHREAD_PARALLEL
     1198printf("\n[%s] <build> thread[%x][%d] created <work> thread[%x][%d]\n",
     1199__FUNCTION__, cxy, level, cxy, lid );
     1200#endif
     1201        }
     1202
     1203        // wait on barrier until "work" children threads completed
     1204        if( pthread_barrier_wait( &parent_barriers_array[cxy] ) )
     1205        {
     1206            printf("\n[ERROR] in %s / first barrier for <build> thread[%x][%d]\n",
     1207            __FUNCTION__ , cxy , level );
     1208
     1209            // report error to parent
     1210            build_args[parent_cxy][level+1].error = 1;
     1211        }
     1212
     1213#if DEBUG_PTHREAD_PARALLEL
     1214printf("\n[%s] <build> thread[%x][%d] resume after children completion\n",
     1215__FUNCTION__, cxy, level );
     1216#endif
     1217
     1218    }  // end level == 0
     1219
     1220    ////////////////////////////////////////////////////////////
     1221    else                        // children are "build" threads
     1222    {
     1223        // the 4 children threads can be created in any core of each quarters
     1224        // of the parent macro-cluster
     1225
     1226        unsigned int parent_x;          // X coordinate of parent macro-cluster
     1227        unsigned int parent_y;          // Y coordinate of parent macro-cluster
     1228        unsigned int child_x;           // X coordinate of child macro-cluster
     1229        unsigned int child_y;           // Y coordinate of child macro-cluster
     1230        unsigned int child_cxy[2][2];   // selected cluster for child thread
     1231        unsigned int child_lid[2][2];   // selected core index for child thread
     1232        int          child_sts[2][2];   // -1 if error / 0 if success / +1 if not found
     1233        unsigned int x;                 // X loop index for children
     1234        unsigned int y;                 // Y loop index for children
     1235       
     1236        unsigned int nb_children = 0;
     1237
     1238        // get parent macro-cluster mask and half-size from level
     1239        unsigned int mask = (1 << level) - 1;
     1240        unsigned int half = (level > 0) ? (1 << (level - 1)) : 0;
     1241
     1242        // get parent macro-cluster coordinates
     1243        parent_x = HAL_X_FROM_CXY( cxy ) & ~mask;
     1244        parent_y = HAL_Y_FROM_CXY( cxy ) & ~mask;
     1245
     1246        // get child_cxy and child_lid for up to 4 children threads : 00 / 01 / 10 / 11
     1247        for (x = 0 ; x < 2 ; x++)
     1248        {
     1249            // compute child macro-cluster X coordinate
     1250            child_x = (x == 0) ? parent_x : (parent_x + half);
     1251
     1252            for (y = 0 ; y < 2 ; y++)
     1253            {
     1254                // compute child macro-cluster Y coordinate
     1255                child_y = (y == 0) ? parent_y : (parent_y + half);
     1256
     1257                // select the best core in macro-cluster
     1258                child_sts[x][y] = get_best_core( HAL_CXY_FROM_XY( child_x , child_y ),
     1259                                                 level-1,
     1260                                                 &child_cxy[x][y],
     1261                                                 &child_lid[x][y] );
     1262
     1263                if( child_sts[x][y] < 0 )  // failure => report error
     1264                {
     1265                    printf("\n[ERROR] in %s : illegal arguments for <build> thread[%x,%x]\n",
     1266                    __FUNCTION__ , cxy , level );
     1267
     1268                    // report error to parent
     1269                    build_args[parent_cxy][level+1].error = 1;
     1270                }
     1271                else if (child_sts[x][y] > 0 )  // macro-cluster undefined => does nothing
     1272                {
     1273                }
     1274                else                            // core found
     1275                {
     1276                    nb_children++;
     1277                }
     1278            }  // end for y
     1279        }  // end for x
     1280
     1281        // kill process if no active core in cluster
     1282        // TODO this "if" should be replaced by an "assert" [AG]
     1283        if( nb_children == 0 )
     1284        {
     1285            printf("\n[PANIC] in %s : no active core in macro cluster [%x,%d]\n",
     1286            __FUNCTION__ , cxy , level );
     1287
     1288            // report error to parent
     1289            build_args[parent_cxy][level+1].error = 1;
     1290
     1291            // kill process
     1292            exit( EXIT_FAILURE );
     1293        }
     1294
     1295        // initialize the barrier for (nb_children + 1)
     1296        if( pthread_barrier_init( &build_barrier[cxy][level], NULL , nb_children + 1 ) )
     1297        {
     1298            printf("\n[error] in %s : cannot initialise barrier for build thread[%x][%d]\n",
     1299            __FUNCTION__ , cxy , level );
     1300
     1301            // report error to parent
     1302            build_args[parent_cxy][level+1].error = 1;
     1303        }
     1304
     1305#if DEBUG_PTHREAD_PARALLEL
     1306printf("\n[%s] <build> thread[%x][%d] created barrier / %d children\n",
     1307__FUNCTION__, cxy, level, nb_children + 1 );
     1308#endif
     1309        // create 1 to 4 children threads
     1310        for (x = 0 ; x < 2 ; x++)
     1311        {
     1312            for (y = 0 ; y < 2 ; y++)
     1313            {
     1314                // thread is created only if macro-cluster is active
     1315                if( child_sts[x][y] == 0 )
     1316                {
     1317                    unsigned int tgt_cxy = child_cxy[x][y];
     1318                    unsigned int tgt_lid = child_lid[x][y];
     1319
     1320                    // set child thread attributes
     1321                    build_attr[tgt_cxy][level-1].attributes = PT_ATTR_DETACH |
     1322                                                              PT_ATTR_CLUSTER_DEFINED |
     1323                                                              PT_ATTR_CORE_DEFINED;
     1324                    build_attr[tgt_cxy][level-1].cxy        = tgt_cxy;
     1325                    build_attr[tgt_cxy][level-1].lid        = tgt_lid;
     1326
     1327                    // propagate build function arguments
     1328                    build_args[tgt_cxy][level-1].cxy                   = child_cxy[x][y];
     1329                    build_args[tgt_cxy][level-1].level                 = level-1;
     1330                    build_args[tgt_cxy][level-1].parent_cxy            = cxy;
     1331                    build_args[tgt_cxy][level-1].root_level            = root_level;
     1332                    build_args[tgt_cxy][level-1].work_func             = work_func;
     1333                    build_args[tgt_cxy][level-1].work_args_array       = work_args_array;
     1334                    build_args[tgt_cxy][level-1].parent_barriers_array = parent_barriers_array;
     1335                   
     1336                    // create thread
     1337                    if( pthread_create( &trdid,                         
     1338                                        &build_attr[tgt_cxy][level-1],   
     1339                                        &pthread_recursive_build,                         
     1340                                        &build_args[tgt_cxy][level-1] ) )
     1341                    {
     1342                        printf("\n[ERROR] in %s : cannot create build thread[%x][%d]\n",
     1343                        __FUNCTION__ , child_cxy , level -1 );
     1344
     1345                        // report error to parent
     1346                        build_args[parent_cxy][level+1].error = 1;
     1347                    }
     1348
     1349#if DEBUG_PTHREAD_PARALLEL
     1350printf("\n[%s] <build> thread[%x][%d] created <build> thread[%x][%d] on core[%x,%d]\n",
     1351__FUNCTION__, cxy, level, tgt_cxy, level - 1, tgt_cxy, tgt_lid );
     1352#endif
     1353                }  //end if sts[x][y]
     1354            }  // end for y
     1355        }  // end for x
     1356       
     1357        // wait on barrier until "build" children threads completed
     1358        if( pthread_barrier_wait( &build_barrier[cxy][level] ) )
     1359        {
     1360            printf("\n[ERROR] in %s / first barrier for <build> thread[%x][%d]\n",
     1361            __FUNCTION__ , cxy , level );
     1362
     1363            // report error to parent
     1364            build_args[parent_cxy][level+1].error = 1;
     1365        }
     1366
     1367#if DEBUG_PTHREAD_PARALLEL
     1368printf("\n[%s] <build> thread[%x][%d] resume after children completion\n",
     1369__FUNCTION__, cxy, level );
     1370#endif
     1371
     1372    }  // end level > 0
     1373
     1374    // report error to parent when required
     1375    if( build_args[cxy][level].error )
     1376    {
     1377        build_args[parent_cxy][level+1].error = 1;
     1378    }
     1379
     1380    // all <build> threads - but the root -
     1381    // signal completion to parent thread and exit
     1382    if( level < root_level )
     1383    {
     1384        if( pthread_barrier_wait( &build_barrier[parent_cxy][level+1] ) )
     1385        {
     1386            printf("\n[ERROR] in %s / second barrier for <build> thread[%x][%d]\n",
     1387            __FUNCTION__ , cxy , level );
     1388
     1389            // report error to parent
     1390            build_args[parent_cxy][level+1].error = 1;
     1391        }
     1392   
     1393#if DEBUG_PTHREAD_PARALLEL
     1394printf("\n[%s] <build> thread[%x][%d] exit\n",
     1395__FUNCTION__, cxy , level );
     1396#endif
     1397        // "build" thread exit
     1398        pthread_exit( NULL );
     1399    }
     1400}  // end pthread_recursive_build()
     1401
     1402///////////////////////////////////////////////////////
     1403int pthread_parallel_create( unsigned int   root_level,
     1404                             void         * work_func,
     1405                             void         * work_args_array,
     1406                             void         * parent_barriers_array )
     1407{
     1408    unsigned int   root_cxy;
     1409    unsigned int   root_lid;    // unused, but required by get_core_id()
     1410   
     1411#if DEBUG_PTHREAD_PARALLEL
     1412printf("\n[%s] enter / root_level %d / func %x / args %x / barriers %x\n",
     1413__FUNCTION__, root_level, work_func, work_args_array, parent_barriers_array );
     1414#endif
     1415
     1416    // get calling thread cluster
     1417    get_core_id( &root_cxy , &root_lid );
     1418
     1419    // set the build function arguments for the root <build> thread
     1420    build_args[root_cxy][root_level].cxy                   = root_cxy;
     1421    build_args[root_cxy][root_level].level                 = root_level;
     1422    build_args[root_cxy][root_level].root_level            = root_level;
     1423    build_args[root_cxy][root_level].work_func             = work_func;
     1424    build_args[root_cxy][root_level].work_args_array       = work_args_array;
     1425    build_args[root_cxy][root_level].parent_barriers_array = parent_barriers_array;
     1426   
     1427    // call the recursive build function
     1428    pthread_recursive_build( &build_args[root_cxy][root_level] );
     1429
     1430    // check error
     1431    if( build_args[root_cxy][root_level].error )
     1432    {
     1433        printf("\n[error] in  %s\n", __FUNCTION__ );
     1434        return -1;
     1435    }
     1436
     1437    return 0;
     1438
     1439}  // end pthread_parallel_create()
     1440
     1441
    10601442
    10611443// Local Variables:
  • trunk/libs/libalmosmkh/almosmkh.h

    r629 r637  
    22 * almosmkh.h - User level ALMOS-MKH specific library definition.
    33 *
    4  * Author     Alain Greiner (2016,2017,2018)
     4 * Author     Alain Greiner (2016,2017,2018,2019)
    55 *
    66 * Copyright (c) UPMC Sorbonne Universites
     
    7272
    7373/***************************************************************************************
    74  * This syscall returns the cluster an local index for the calling core.
     74 * This syscall returns the cluster identifier and the local index
     75 * for the calling core.
    7576 ***************************************************************************************
    7677 * @ cxy      : [out] cluster identifier.
     
    7879 * @ return always 0.
    7980 **************************************************************************************/
    80 int get_core( unsigned int * cxy,
    81               unsigned int * lid );
     81int get_core_id( unsigned int * cxy,
     82                 unsigned int * lid );
     83
     84/***************************************************************************************
     85 * This syscall returns the number of cores in a given cluster.
     86 ***************************************************************************************
     87 * @ cxy      : [in]  target cluster identifier.
     88 * @ ncores   : [out] number of cores in target cluster.
     89 * @ return always 0.
     90 **************************************************************************************/
     91int get_nb_cores( unsigned int   cxy,
     92                  unsigned int * ncores );
     93
     94/***************************************************************************************
     95 * This syscall uses the DQDT to search, in a macro-cluster specified by the
     96 * <cxy_base> and <level> arguments, the core with the lowest load.
     97 * It writes in the <cxy> and <lid> buffers the selected core cluster identifier
     98 * and the local core index.
     99 ***************************************************************************************
     100 * @ cxy_base : [in]  any cluster identifier in macro-cluster.
     101 * @ level    : [in]  macro-cluster level in [1,2,3,4,5].
     102 * @ cxy      : [out] selected core cluster identifier.
     103 * @ lid      : [out] selected core local index.
     104 * @ return 0 if success / 1 if no core in macro-cluster / -1 if illegal arguments.
     105 **************************************************************************************/
     106int get_best_core( unsigned int   cxy_base,
     107                   unsigned int   level,
     108                   unsigned int * cxy,
     109                   unsigned int * lid );
    82110
    83111/***************************************************************************************
    84  * This function returns the calling core cycles counter,
     112 * This function returns the value contained in the calling core cycles counter,
    85113 * taking into account a possible overflow on 32 bits architectures.
    86114 ***************************************************************************************
     
    414442                      unsigned int cxy );
    415443
     444/********* Non standard (ALMOS-MKH specific) pthread_parallel_create() syscall  *********/
     445
     446//////////////////////////////////////////////////////////////////////////////////////////
     447// This system call can be used to parallelize the creation and the termination
     448// of a parallel multi-threaded application. It removes the loop in the main thread that
     449// creates the N working threads (N sequential pthread_create() ). It also removes the
     450// loop that waits for completion of these N working threads (N sequential pthread_join() ).
     451// It creates one "work" thread (in detached mode) per core in the target architecture.
     452// Each "work" thread is identified by the [cxy][lid] indexes (cluster / local core).
     453// The pthread_parallel_create() function returns only when all "work" threads completed
     454// (successfully or not).
     455//
     456// To use this system call, the application code must define the following structures:
     457// - To define the arguments to pass to the <work> function the application must allocate
     458//   and initialize a first 2D array, indexed by [cxy] and [lid] indexes, where each slot
     459//   contains an application specific structure, and another 2D array, indexed by the same
     460//   indexes, containing pointers on these structures. This array of pointers is one
     461//   argument of the pthread_parallel_create() function.
     462// - To detect the completion of the <work> threads, the application must allocate a 1D
     463//   array, indexed by the cluster index [cxy], where each slot contains a pthread_barrier
     464//   descriptor. This barrier is initialised by the pthread_parallel_create() function,
     465//   in all clusters containing at least one work thread. This array of barriers is another
     466//   argument of the pthread_parallel_create() function.
     467//
     468// Implementation note:
     469// To parallelize the "work" threads creation and termination, the pthread_parallel_create()
     470// function creates a distributed quad-tree (DQT) of "build" threads covering all cores
     471// required to execute the parallel application.
     472// Depending on the hardware topology, this DQT can be truncated, (i.e. some
     473// parent nodes can have fewer than 4 children), if (x_size != y_size), or if one size
     474// is not a power of 2. Each "build" thread is identified by two indexes [cxy][level].
     475// Each "build" thread makes the following tasks:
     476// 1) It calls the pthread_create() function to create up to 4 children threads, that
     477//    are "work" threads when (level == 0), or "build" threads, when (level > 0).
     478// 2) It initializes the barrier (global variable), used to block/unblock
     479//    the parent thread until children completion.
     480// 3) It calls the pthread_barrier_wait( self ) to wait until all children threads
     481//    completed (successfully or not).
     482// 4) It calls the pthread_barrier_wait( parent ) to unblock the parent thread.
     483//////////////////////////////////////////////////////////////////////////////////////////
     484
     485/*****************************************************************************************
     486 * This blocking function creates N working threads that execute the code defined
     487 * by the <work_func> and <work_args_array> arguments.
     488 * The number N of created threads is entirely defined by the <root_level> argument.
     489 * This value defines an abstract quad-tree, with a square base : level in [0,1,2,3,4],
     490 * side in [1,2,4,8,16], nclusters in [1,4,16,64,256]. This base is called a macro-cluster.
     491 * A working thread is created on all cores contained in the specified macro-cluster.
     492 * The actual number of physical clusters containing cores can be smaller than the number
     493 * of clusters covered by the quad tree. The actual number of cores in a cluster can be
     494 * less than the max value.
     495 *
     496 * In the current implementation, all threads execute the same <work_func> function,
     497 * on different arguments, that are specified as a 2D array of pointers <work_args_array>.
     498 * This can be modified in a future version, where the <work_func> argument can become
     499 * a 2D array of pointers, to have one specific function for each thread.
     500 *****************************************************************************************
     501 * @ root_level            : [in]  DQT root level in [0,1,2,3,4].
     502 * @ work_func             : [in]  pointer on start function.
     503 * @ work_args_array       : [in]  pointer on a 2D array of pointers.
     504 * @ parent_barriers_array : [in]  pointer on a 1D array of barriers.
     505 * @ return 0 if success / return -1 if failure.
     506 ****************************************************************************************/
     507int pthread_parallel_create( unsigned int   root_level,
     508                             void         * work_func,
     509                             void         * work_args_array,
     510                             void         * parent_barriers_array );
     511
    416512#endif /* _LIBALMOSMKH_H_ */
    417513
  • trunk/libs/libpthread/pthread.c

    r619 r637  
    230230
    231231////////////////////////////////////////////////////////////////////////////////////////////
    232 // The following functions define another implementation for the POSX barrier
    233 // based on a distributed quadtree implemented in user space, and relying
    234 // on a busy waiting policy.
    235 ////////////////////////////////////////////////////////////////////////////////////////////
    236 
    237 
    238 ////////////////////////////////////////////////////////////////////////////////////////////
    239 // This recursive function initializes the SQT nodes
    240 // traversing the SQT from root to bottom
    241 ////////////////////////////////////////////////////////////////////////////////////////////
    242 static void sqt_barrier_build( pthread_barrier_t  * barrier,
     232// The following functions define another implementation for the POSIX barrier, based on
     233// a distributed quad tree implemented in user space, but using a busy waiting policy.
     234////////////////////////////////////////////////////////////////////////////////////////////
     235
     236
     237////////////////////////////////////////////////////////////////////////////////////////////
     238// This recursive function initializes the DQT nodes traversing the DQT from root to bottom
     239////////////////////////////////////////////////////////////////////////////////////////////
     240static void dqt_barrier_build( pthread_barrier_t  * barrier,
    243241                               unsigned int         x,
    244242                               unsigned int         y,
    245243                               unsigned int         level,
    246                                sqt_node_t         * parent,
     244                               dqt_node_t         * parent,
    247245                               unsigned int         x_size,
    248246                               unsigned int         y_size,
     
    250248{
    251249    // get target node address
    252     sqt_node_t * node = barrier->node[x][y][level];
     250    dqt_node_t * node = barrier->node[x][y][level];
    253251   
    254252    if (level == 0 )        // terminal case
     
    266264
    267265#if PTHREAD_BARRIER_DEBUG
    268 printf("\n[BARRIER] %s : sqt_node[%d][%d][%d] / arity %d / desc %x\n"
     266printf("\n[BARRIER] %s : dqt_node[%d][%d][%d] / arity %d / desc %x\n"
    269267"parent %x / child0 %x / child1 %x / child2 %x / child3 %x\n",
    270268__FUNCTION__, x, y, level, node->arity, node, node->parent,
     
    312310
    313311#if PTHREAD_BARRIER_DEBUG
    314 printf("\n[BARRIER] %s : sqt_node[%d][%d][%d] / arity %d / desc %x\n"
     312printf("\n[BARRIER] %s : dqt_node[%d][%d][%d] / arity %d / desc %x\n"
    315313"parent %x / child0 %x / child1 %x / child2 %x / child3 %x\n",
    316314__FUNCTION__, x, y, level, node->arity, node, node->parent,
     
    322320        {
    323321            if ( (cx[i] < x_size) && (cy[i] < y_size) )
    324             sqt_barrier_build( barrier,
     322            dqt_barrier_build( barrier,
    325323                               cx[i],
    326324                               cy[i],
     
    332330        }
    333331    }
    334 }  // end sqt_barrier_build()
     332}  // end dqt_barrier_build()
    335333
    336334////////////////////////////////////////////////////////////////
     
    394392                     ( (l == 4) && ((x&0x0F) == 0) && ((y&0x0F) == 0) ) )
    395393                 {
    396                      sqt_node_t * node = remote_malloc( sizeof(sqt_node_t) , cxy );
     394                     dqt_node_t * node = remote_malloc( sizeof(dqt_node_t) , cxy );
    397395
    398396                     if( node == NULL )
    399397                     {
    400                          printf("\n[ERROR] in %s : cannot allocate sqt_node in cluster %x\n",
     398                         printf("\n[ERROR] in %s : cannot allocate dqt_node in cluster %x\n",
    401399                         __FUNCTION__ , cxy );
    402400                         return -1;
     
    411409           
    412410    // recursively initialize all DQT nodes from root to bottom
    413     sqt_barrier_build( barrier,
     411    dqt_barrier_build( barrier,
    414412                       0,       
    415413                       0,
     
    428426//////////////////////////////////////////////////////////////////////////////////////////
    429427// This recursive function decrements the distributed "count" variables,
    430 // traversing the SQT from bottom to root.
     428// traversing the DQT from bottom to root.
    431429// The last arrived thread resets the local node before returning.
    432430//////////////////////////////////////////////////////////////////////////////////////////
    433 static void sqt_barrier_decrement( sqt_node_t * node )
     431static void dqt_barrier_decrement( dqt_node_t * node )
    434432{
    435433
     
    457455    {
    458456        // decrement the parent node if the current node is not the root
    459         if ( node->parent != NULL )  sqt_barrier_decrement( node->parent );
     457        if ( node->parent != NULL )  dqt_barrier_decrement( node->parent );
    460458
    461459#if PTHREAD_BARRIER_DEBUG
     
    484482        return;
    485483    }
    486 } // end sqt_barrier_decrement()
     484} // end dqt_barrier_decrement()
    487485   
    488486///////////////////////////////////////////////////////
     
    504502
    505503    // recursively decrement count from bottom to root
    506     sqt_barrier_decrement( barrier->node[x][y][0] );
     504    dqt_barrier_decrement( barrier->node[x][y][0] );
    507505
    508506    hal_user_fence();
  • trunk/libs/libpthread/pthread.h

    r632 r637  
    22 * pthread.h - User level <pthread> library definition.
    33 *
    4  * Author     Alain Greiner (2016,2017,2018)
     4 * Author     Alain Greiner (2016,2017,2018,2019)
    55 *
    66 * Copyright (c) UPMC Sorbonne Universites
  • trunk/libs/mini-libc/stdio.h

    r623 r637  
    22 * stdio.h - User level <stdio> library definition.
    33 *
    4  * Author     Alain Greiner (2016,2017,2018)
     4 * Author     Alain Greiner (2016,2017,2018,2019)
    55 *
    66 * Copyright (c) UPMC Sorbonne Universites
  • trunk/libs/mini-libc/stdlib.c

    r589 r637  
    148148void * malloc( unsigned int size )
    149149{
    150     // get cluster identifier
    151     unsigned int cxy;
    152     unsigned int lid;
    153     get_core( &cxy , &lid );
     150    unsigned int cxy;
     151    unsigned int lid;
     152
     153    // get cluster identifier
     154    get_core_id( &cxy , &lid );
    154155
    155156    return remote_malloc( size, cxy );
     
    160161                unsigned int size )
    161162{
    162     // get calling core cluster identifier
    163     unsigned int cxy;
    164     unsigned int lid;
    165     get_core( &cxy , &lid );
     163    unsigned int cxy;
     164    unsigned int lid;
     165
     166    // get cluster identifier
     167    get_core_id( &cxy , &lid );
    166168
    167169    return remote_calloc( count , size , cxy );
     
    172174                 unsigned int  size )
    173175{
    174     // get calling core cluster identifier
    175     unsigned int cxy;
    176     unsigned int lid;
    177     get_core( &cxy , &lid );
     176    unsigned int cxy;
     177    unsigned int lid;
     178
     179    // get cluster identifier
     180    get_core_id( &cxy , &lid );
    178181
    179182    return remote_realloc( ptr , size , cxy );
     
    183186void free( void * ptr )
    184187{
    185     // get calling core cluster identifier
    186     unsigned int cxy;
    187     unsigned int lid;
    188     get_core( &cxy , &lid );
     188    unsigned int cxy;
     189    unsigned int lid;
     190
     191    // get cluster identifier
     192    get_core_id( &cxy , &lid );
    189193
    190194    remote_free( ptr , cxy );
Note: See TracChangeset for help on using the changeset viewer.