/*
 * remote_barrier.c - POSIX barrier implementation.
 *
 * Author Alain Greiner (2016,2017,2018,2019,2020)
 *
 * Copyright (c) UPMC Sorbonne Universites
 *
 * This file is part of ALMOS-MKH.
 *
 * ALMOS-MKH is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by
 * the Free Software Foundation; version 2.0 of the License.
 *
 * ALMOS-MKH is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with ALMOS-MKH; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */

// NOTE(review): the header names of the following #include directives were
// lost when this copy was extracted — restore them from the ALMOS-MKH tree.
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

////////////////////////////////////////////////////
//  generic (implementation independent) functions
////////////////////////////////////////////////////

// Scan the barriers list of the reference process, looking for a generic
// barrier whose "ident" field (user-space virtual address, used as unique
// identifier) matches the <ident> argument.
// Returns the extended pointer on the matching generic barrier descriptor,
// or XPTR_NULL when no barrier with this identifier is registered.
///////////////////////////////////////////////////
xptr_t generic_barrier_from_ident( intptr_t ident )
{
    // get pointer on local process_descriptor
    process_t * process = CURRENT_THREAD->process;

    // get pointers on reference process
    xptr_t      ref_xp  = process->ref_xp;
    cxy_t       ref_cxy = GET_CXY( ref_xp );
    process_t * ref_ptr = (process_t *)GET_PTR( ref_xp );

    // get extended pointer on root of barriers list
    xptr_t root_xp = XPTR( ref_cxy , &ref_ptr->barrier_root );

    // scan reference process barriers list
    xptr_t              iter_xp;
    xptr_t              barrier_xp;
    cxy_t               barrier_cxy;
    generic_barrier_t * barrier_ptr;
    intptr_t            current;
    bool_t              found = false;

    XLIST_FOREACH( root_xp , iter_xp )
    {
        barrier_xp  = XLIST_ELEMENT( iter_xp , generic_barrier_t , list );
        barrier_cxy = GET_CXY( barrier_xp );
        barrier_ptr = (generic_barrier_t *)GET_PTR( barrier_xp );

        // "ident" is stored as a pointer field => read with lpt and cast back
        current = (intptr_t)hal_remote_lpt( XPTR( barrier_cxy , &barrier_ptr->ident ) );

        if( ident == current )
        {
            found = true;
            break;
        }
    }

    if( found == false )  return XPTR_NULL;
    else                  return barrier_xp;

}  // end generic_barrier_from_ident()

// Allocate and initialise a generic barrier descriptor in the reference
// cluster, create the implementation-specific descriptor (simple barrier
// when <attr> is NULL, DQT barrier otherwise), and register the generic
// barrier in the reference process list of barriers.
// @ ident    : barrier identifier (user-space virtual address).
// @ count    : total number of expected threads.
// @ attr     : barrier attributes (x_size / y_size / nthreads), or NULL.
// @ returns 0 on success / returns -1 on failure (allocation or bad attr).
//////////////////////////////////////////////////////////////
error_t generic_barrier_create( intptr_t                ident,
                                uint32_t                count,
                                pthread_barrierattr_t * attr )
{
    generic_barrier_t * gen_barrier_ptr;  // local pointer on generic barrier descriptor
    void              * barrier;          // local pointer on impl barrier descriptor

    // get pointer on local process_descriptor
    process_t * process = CURRENT_THREAD->process;

    // get pointers on reference process
    xptr_t      ref_xp  = process->ref_xp;
    cxy_t       ref_cxy = GET_CXY( ref_xp );
    process_t * ref_ptr = (process_t *)GET_PTR( ref_xp );

    // allocate memory for generic barrier descriptor
    gen_barrier_ptr = kmem_remote_alloc( ref_cxy,
                                         bits_log2(sizeof(generic_barrier_t)),
                                         AF_KERNEL );
    if( gen_barrier_ptr == NULL )
    {
        printk("\n[ERROR] in %s : cannot create generic barrier\n", __FUNCTION__ );
        return -1;
    }

    // create implementation specific barrier descriptor
    if( attr == NULL )                                    // simple barrier
    {
        // create simple barrier descriptor
        barrier = simple_barrier_create( count );
    }
    else                                                  // QDT barrier
    {
        uint32_t x_size   = attr->x_size;
        uint32_t y_size   = attr->y_size;
        uint32_t nthreads = attr->nthreads;

        // check attributes / count
        if( (x_size * y_size * nthreads) != count )
        {
            printk("\n[ERROR] in %s : count(%d) != x_size(%d) * y_size(%d) * nthreads(%d)\n",
            __FUNCTION__, count, x_size, y_size, nthreads );
            kmem_remote_free( ref_cxy, gen_barrier_ptr, bits_log2(sizeof(generic_barrier_t)) );
            return -1;
        }

        // create DQT barrier descriptor
        barrier = dqt_barrier_create( x_size , y_size , nthreads );
    }

    if( barrier == NULL )
    {
        printk("\n[ERROR] in %s : cannot create impl barrier\n", __FUNCTION__ );
        kmem_remote_free( ref_cxy, gen_barrier_ptr, bits_log2(sizeof(generic_barrier_t)) );
        return -1;
    }

    // initialize the generic barrier descriptor
    // ident is stored as a pointer field => written with spt
    hal_remote_spt( XPTR( ref_cxy , &gen_barrier_ptr->ident  ) , (void*)ident );
    hal_remote_s32( XPTR( ref_cxy , &gen_barrier_ptr->is_dqt ) , (attr != NULL) );
    hal_remote_spt( XPTR( ref_cxy , &gen_barrier_ptr->extend ) , barrier );

    // build extended pointers on lock, root and entry for reference process xlist
    xptr_t root_xp  = XPTR( ref_cxy , &ref_ptr->barrier_root );
    xptr_t lock_xp  = XPTR( ref_cxy , &ref_ptr->sync_lock );
    xptr_t entry_xp = XPTR( ref_cxy , &gen_barrier_ptr->list );

    // register barrier in reference process xlist of barriers
    remote_busylock_acquire( lock_xp );
    xlist_add_first( root_xp , entry_xp );
    remote_busylock_release( lock_xp );

    return 0;

}  // end generic_barrier_create()

// Destroy the generic barrier identified by <gen_barrier_xp> : delete the
// implementation-specific descriptor (simple or DQT), remove the generic
// barrier from the reference process list, and release its memory.
/////////////////////////////////////////////////////
void generic_barrier_destroy( xptr_t gen_barrier_xp )
{
    // get pointer on local process_descriptor
    process_t * process = CURRENT_THREAD->process;

    // get pointers on reference process
    xptr_t      ref_xp  = process->ref_xp;
    cxy_t       ref_cxy = GET_CXY( ref_xp );
    process_t * ref_ptr = GET_PTR( ref_xp );

    // get cluster and local pointer on generic barrier descriptor
    generic_barrier_t * gen_barrier_ptr = GET_PTR( gen_barrier_xp );
    cxy_t               gen_barrier_cxy = GET_CXY( gen_barrier_xp );

    // get barrier type and extension pointer
    bool_t is_dqt = hal_remote_l32( XPTR( gen_barrier_cxy , &gen_barrier_ptr->is_dqt ) );
    void * extend = hal_remote_lpt( XPTR( gen_barrier_cxy , &gen_barrier_ptr->extend ) );

    // build extended pointer on implementation dependant barrier descriptor
    xptr_t barrier_xp = XPTR( gen_barrier_cxy , extend );

    // delete the implementation specific barrier
    if( is_dqt ) dqt_barrier_destroy( barrier_xp );
    else         simple_barrier_destroy( barrier_xp );

    // build extended pointers on lock and entry for reference process xlist
    xptr_t lock_xp  = XPTR( ref_cxy , &ref_ptr->sync_lock );
    xptr_t entry_xp = XPTR( gen_barrier_cxy , &gen_barrier_ptr->list );

    // remove barrier from reference process xlist
    remote_busylock_acquire( lock_xp );
    xlist_unlink( entry_xp );
    remote_busylock_release( lock_xp );

    // release memory allocated to generic barrier descriptor
    kmem_remote_free( gen_barrier_cxy, gen_barrier_ptr, bits_log2(sizeof(generic_barrier_t)) );

}  // end generic_barrier_destroy()

// Block the calling thread on the generic barrier identified by
// <gen_barrier_xp>, dispatching to the simple or DQT wait function
// according to the "is_dqt" field.
//////////////////////////////////////////////////
void generic_barrier_wait( xptr_t gen_barrier_xp )
{
    // get generic barrier descriptor cluster and pointer
    cxy_t               gen_barrier_cxy = GET_CXY( gen_barrier_xp );
    generic_barrier_t * gen_barrier_ptr = GET_PTR( gen_barrier_xp );

    // get implementation type and extend local pointer
    bool_t is_dqt = hal_remote_l32( XPTR( gen_barrier_cxy , &gen_barrier_ptr->is_dqt ) );
    void * extend = hal_remote_lpt( XPTR( gen_barrier_cxy , &gen_barrier_ptr->extend ) );

    // build extended pointer on implementation specific barrier descriptor
    xptr_t barrier_xp = XPTR( gen_barrier_cxy , extend );

    // call the relevant wait function
    if( is_dqt ) dqt_barrier_wait( barrier_xp );
    else         simple_barrier_wait( barrier_xp );

}  // end generic_barrier_wait()

// Display the state of the generic barrier identified by <gen_barrier_xp>,
// dispatching to the simple or DQT display function.
/////////////////////////////////////////////////////
void generic_barrier_display( xptr_t gen_barrier_xp )
{
    // get cluster and local pointer
    generic_barrier_t * gen_barrier_ptr = GET_PTR( gen_barrier_xp );
    cxy_t               gen_barrier_cxy = GET_CXY( gen_barrier_xp );

    // get barrier type and extend pointer
    bool_t is_dqt = hal_remote_l32( XPTR( gen_barrier_cxy , &gen_barrier_ptr->is_dqt ) );
    void * extend = hal_remote_lpt( XPTR( gen_barrier_cxy , &gen_barrier_ptr->extend ) );

    // build extended pointer on the implementation specific barrier descriptor
    xptr_t barrier_xp = XPTR( gen_barrier_cxy , extend );

    // display barrier state
    if( is_dqt ) dqt_barrier_display( barrier_xp );
    else         simple_barrier_display( barrier_xp );
}

/////////////////////////////////////////////////////////////
//      simple barrier functions
/////////////////////////////////////////////////////////////

// Allocate and initialise a simple (centralized, sense-reversing) barrier
// descriptor in the reference process cluster.
// @ count    : number of expected threads.
// @ returns local pointer on barrier descriptor / NULL on allocation failure.
///////////////////////////////////////////////////////////
simple_barrier_t * simple_barrier_create( uint32_t count )
{
    simple_barrier_t * barrier;

    // get pointer on local client process descriptor
    thread_t  * this    = CURRENT_THREAD;
    process_t * process = this->process;

    // get reference process cluster
    xptr_t ref_xp  = process->ref_xp;
    cxy_t  ref_cxy = GET_CXY( ref_xp );

    // allocate memory for simple barrier descriptor
    barrier = kmem_remote_alloc( ref_cxy,
                                 bits_log2(sizeof(simple_barrier_t)),
                                 AF_ZERO );
    if( barrier == NULL )
    {
        printk("\n[ERROR] in %s : cannot create simple barrier\n", __FUNCTION__ );
        return NULL;
    }

    // initialise simple barrier descriptor
    hal_remote_s32      ( XPTR( ref_cxy , &barrier->arity )   , count );
    hal_remote_s32      ( XPTR( ref_cxy , &barrier->current ) , 0 );
    hal_remote_s32      ( XPTR( ref_cxy , &barrier->sense )   , 0 );
    xlist_root_init     ( XPTR( ref_cxy , &barrier->root ) );
    remote_busylock_init( XPTR( ref_cxy , &barrier->lock ) , LOCK_BARRIER_STATE );

#if DEBUG_BARRIER_CREATE
uint32_t cycle = (uint32_t)hal_get_cycles();
if( cycle > DEBUG_BARRIER_CREATE )
printk("\n[%s] thread[%x,%x] created barrier (%x,%x) / count %d / cycle %d\n",
__FUNCTION__, process->pid, this->trdid, ref_cxy, barrier, count, cycle );
#endif

    return barrier;

}  // end simple_barrier_create()

// Release the memory allocated to the simple barrier identified by
// <barrier_xp>. The caller must guarantee that no thread is still
// registered in the barrier waiting queue.
////////////////////////////////////////////////
void simple_barrier_destroy( xptr_t barrier_xp )
{
    // get barrier cluster and local pointer
    cxy_t              barrier_cxy = GET_CXY( barrier_xp );
    simple_barrier_t * barrier_ptr = GET_PTR( barrier_xp );

    // release memory allocated for barrier descriptor
    kmem_remote_free( barrier_cxy, barrier_ptr, bits_log2(sizeof(simple_barrier_t)) );

#if DEBUG_BARRIER_DESTROY
uint32_t    cycle   = (uint32_t)hal_get_cycles();
thread_t  * this    = CURRENT_THREAD;
process_t * process = this->process;
if( cycle > DEBUG_BARRIER_DESTROY )
printk("\n[%s] thread[%x,%x] deleted barrier (%x,%x) / cycle %d\n",
__FUNCTION__, process->pid, this->trdid, barrier_ptr, barrier_cxy, cycle );
#endif

}  // end simple_barrier_destroy()

// Sense-reversing barrier wait : each arriving thread atomically increments
// the "current" counter under the barrier busylock. All threads but the
// last block, register in the waiting queue, and deschedule. The last
// arriving thread resets "current", toggles "sense", and unblocks all
// registered threads. The calling thread must be able to yield.
/////////////////////////////////////////////
void simple_barrier_wait( xptr_t barrier_xp )
{
    uint32_t  expected;
    uint32_t  sense;
    uint32_t  current;
    uint32_t  arity;
    xptr_t    root_xp;
    xptr_t    lock_xp;
    xptr_t    current_xp;
    xptr_t    sense_xp;
    xptr_t    arity_xp;

    // get pointer on calling thread
    thread_t * this = CURRENT_THREAD;

    // check calling thread can yield
    thread_assert_can_yield( this , __FUNCTION__ );

    // get cluster and local pointer on remote barrier
    simple_barrier_t * barrier_ptr = GET_PTR( barrier_xp );
    cxy_t              barrier_cxy = GET_CXY( barrier_xp );

#if DEBUG_BARRIER_WAIT
uint32_t cycle = (uint32_t)hal_get_cycles();
if( cycle > DEBUG_BARRIER_WAIT )
printk("\n[%s] thread[%x,%x] enter / barrier (%x,%x) / cycle %d\n",
__FUNCTION__, this->process->pid, this->trdid, barrier_cxy, barrier_ptr, cycle );
#endif

    // build extended pointers on various barrier descriptor fields
    lock_xp    = XPTR( barrier_cxy , &barrier_ptr->lock );
    root_xp    = XPTR( barrier_cxy , &barrier_ptr->root );
    current_xp = XPTR( barrier_cxy , &barrier_ptr->current );
    sense_xp   = XPTR( barrier_cxy , &barrier_ptr->sense );
    arity_xp   = XPTR( barrier_cxy , &barrier_ptr->arity );

    // take busylock protecting the barrier state
    remote_busylock_acquire( lock_xp );

    // get sense and threads values from barrier descriptor
    sense = hal_remote_l32( sense_xp );
    arity = hal_remote_l32( arity_xp );

    // compute expected value
    if ( sense == 0 ) expected = 1;
    else              expected = 0;

    // increment current number of arrived threads / get value before increment
    current = hal_remote_atomic_add( current_xp , 1 );

    // last thread reset current, toggle sense, and activate all waiting threads
    // other threads block, register in queue, and deschedule
    if( current == (arity - 1) )                       // last thread
    {
        hal_remote_s32( current_xp , 0 );
        hal_remote_s32( sense_xp , expected );

        // unblock all waiting threads
        while( xlist_is_empty( root_xp ) == false )
        {
            // get pointers on first waiting thread
            xptr_t     thread_xp  = XLIST_FIRST( root_xp , thread_t , wait_list );
            cxy_t      thread_cxy = GET_CXY( thread_xp );
            thread_t * thread_ptr = GET_PTR( thread_xp );

#if (DEBUG_BARRIER_WAIT & 1)
trdid_t     trdid   = hal_remote_l32( XPTR( thread_cxy , &thread_ptr->trdid ) );
process_t * process = hal_remote_lpt( XPTR( thread_cxy , &thread_ptr->process ) );
pid_t       pid     = hal_remote_l32( XPTR( thread_cxy , &process->pid ) );
if( cycle > DEBUG_BARRIER_WAIT )
printk("\n[%s] thread[%x,%x] unblocks thread[%x,%x]\n",
__FUNCTION__, this->process->pid, this->trdid, pid, trdid );
#endif

            // remove waiting thread from queue
            xlist_unlink( XPTR( thread_cxy , &thread_ptr->wait_list ) );

            // unblock waiting thread
            thread_unblock( thread_xp , THREAD_BLOCKED_USERSYNC );
        }

        // release busylock protecting the barrier
        remote_busylock_release( lock_xp );
    }
    else                                               // not the last thread
    {

#if (DEBUG_BARRIER_WAIT & 1)
if( cycle > DEBUG_BARRIER_WAIT )
printk("\n[%s] thread[%x,%x] blocks\n",
__FUNCTION__, this->process->pid, this->trdid );
#endif

        // register calling thread in barrier waiting queue
        xlist_add_last( root_xp , XPTR( local_cxy , &this->wait_list ) );

        // block calling thread
        thread_block( XPTR( local_cxy , this ) , THREAD_BLOCKED_USERSYNC );

        // release busylock protecting the remote_barrier
        // (the block/release order is deliberate : the thread is blocked
        // before the lock is released, and only deschedules afterwards)
        remote_busylock_release( lock_xp );

        // deschedule
        sched_yield("blocked on barrier");
    }

#if DEBUG_BARRIER_WAIT
cycle = (uint32_t)hal_get_cycles();
if( cycle > DEBUG_BARRIER_WAIT )
printk("\n[%s] thread[%x,%x] exit / barrier (%x,%x) / cycle %d\n",
__FUNCTION__, this->process->pid, this->trdid, barrier_cxy, barrier_ptr, cycle );
#endif

}  // end simple_barrier_wait()

// Display on the kernel console the current number of arrived threads
// versus the expected number for the simple barrier <barrier_xp>.
/////////////////////////////////////////////////
void simple_barrier_display( xptr_t barrier_xp )
{
    // get cluster and local pointer on simple barrier
    simple_barrier_t * barrier_ptr = GET_PTR( barrier_xp );
    cxy_t              barrier_cxy = GET_CXY( barrier_xp );

    // get barrier global parameters
    uint32_t current = hal_remote_l32( XPTR( barrier_cxy , &barrier_ptr->current ) );
    uint32_t arity   = hal_remote_l32( XPTR( barrier_cxy , &barrier_ptr->arity ) );

    printk("\n***** simple barrier : %d arrived threads on %d *****\n",
    current, arity );

}  // end simple_barrier_display()
///////////////////////////////////////////////////////////// // DQT barrier functions ///////////////////////////////////////////////////////////// static void dqt_barrier_increment( xptr_t node_xp ); #if DEBUG_BARRIER_CREATE void dqt_barrier_display( xptr_t barrier_xp ); #endif /////////////////////////////////////////////////////// dqt_barrier_t * dqt_barrier_create( uint32_t x_size, uint32_t y_size, uint32_t nthreads ) { dqt_barrier_t * barrier; // local pointer on DQT barrier descriptor xptr_t barrier_xp; // extended pointer on DQT barrier descriptor uint32_t z; // actual DQT size == max(x_size,y_size) uint32_t levels; // actual number of DQT levels uint32_t x; // X coordinate in QDT mesh uint32_t y; // Y coordinate in QDT mesh uint32_t l; // level coordinate // compute number of DQT levels, depending on the mesh size z = (x_size > y_size) ? x_size : y_size; levels = (z < 2) ? 1 : (z < 3) ? 2 : (z < 5) ? 3 : (z < 9) ? 4 : 5; // check x_size and y_size arguments assert( __FUNCTION__, (z <= 16), "DQT mesh size larger than (16*16)\n"); // check size of an array of 5 DQT nodes assert( __FUNCTION__, (sizeof(dqt_node_t) * 5 <= 512 ), "array of DQT nodes larger than 512 bytes\n"); // check size of DQT barrier descriptor assert( __FUNCTION__, (sizeof(dqt_barrier_t) <= 0x4000 ), "DQT barrier descriptor larger than 4 pages\n"); // get pointer on client thread and process descriptors thread_t * this = CURRENT_THREAD; process_t * process = this->process; #if DEBUG_BARRIER_CREATE uint32_t cycle = (uint32_t)hal_get_cycles(); if( cycle > DEBUG_BARRIER_CREATE ) printk("\n[%s] thread[%x,%x] enter : x_size %d / y_size %d / levels %d / cycle %d\n", __FUNCTION__, process->pid, this->trdid, x_size, y_size, levels, cycle ); #endif // get reference process cluster xptr_t ref_xp = process->ref_xp; cxy_t ref_cxy = GET_CXY( ref_xp ); // 1. 
allocate 4 small pages for the DQT barrier descriptor in reference cluster barrier = kmem_remote_alloc( ref_cxy, CONFIG_PPM_PAGE_ORDER + 2, // 4 small pages AF_ZERO ); if( barrier == NULL ) { printk("\n[ERROR] in %s : cannot create DQT barrier\n", __FUNCTION__ ); return NULL; } // get pointers on DQT barrier descriptor in reference cluster barrier_xp = XPTR( ref_cxy , barrier ); // initialize global parameters in DQT barrier descriptor hal_remote_s32( XPTR( ref_cxy , &barrier->x_size ) , x_size ); hal_remote_s32( XPTR( ref_cxy , &barrier->y_size ) , x_size ); hal_remote_s32( XPTR( ref_cxy , &barrier->nthreads ) , nthreads ); #if DEBUG_BARRIER_CREATE if( cycle > DEBUG_BARRIER_CREATE ) printk("\n[%s] thread[%x,%x] created DQT barrier descriptor(%x,%x)\n", __FUNCTION__, process->pid, this->trdid, ref_cxy, barrier ); #endif // 2. allocate memory for an array of 5 DQT nodes // in all existing clusters covered by the DQDT // (5 nodes per cluster <= 512 bytes per cluster) // and complete barrier descriptor initialisation. 
for ( x = 0 ; x < x_size ; x++ ) { for ( y = 0 ; y < y_size ; y++ ) { cxy_t cxy = HAL_CXY_FROM_XY( x , y ); // target cluster identifier xptr_t local_array_xp; // xptr on nodes array in cluster cxy // allocate memory in existing clusters only if( LOCAL_CLUSTER->cluster_info[x][y] ) { void * ptr = kmem_remote_alloc( cxy , 9 , AF_ZERO ); // 512 bytes if( ptr == NULL ) { printk("\n[ERROR] in %s : cannot allocate DQT in cluster %x\n", __FUNCTION__, cxy ); return NULL; } // build extended pointer on local node array in cluster cxy local_array_xp = XPTR( cxy , ptr ); // initialize the node_xp[x][y][l] array in barrier descriptor for ( l = 0 ; l < levels ; l++ ) { xptr_t node_xp = local_array_xp + ( l * sizeof(dqt_node_t) ); hal_remote_s64( XPTR( ref_cxy , &barrier->node_xp[x][y][l] ), node_xp ); #if (DEBUG_BARRIER_CREATE & 1) if( cycle > DEBUG_BARRIER_CREATE ) printk(" - dqt_node_xp[%d,%d,%d] = (%x,%x) / &dqt_node_xp = %x\n", x , y , l , GET_CXY( node_xp ), GET_PTR( node_xp ), &barrier->node_xp[x][y][l] ); #endif } } else // register XPTR_NULL for all non-existing entries { for ( l = 0 ; l < levels ; l++ ) { hal_remote_s64( XPTR( ref_cxy , &barrier->node_xp[x][y][l] ), XPTR_NULL ); } } } // end for y } // end for x #if DEBUG_BARRIER_CREATE if( cycle > DEBUG_BARRIER_CREATE ) printk("\n[%s] thread[%x,%x] initialized array of pointers in DQT barrier\n", __FUNCTION__, process->pid, this->trdid ); #endif // 3. 
initialise all distributed DQT nodes using remote accesses // and the pointers stored in the node_xp[x][y][l] array for ( x = 0 ; x < x_size ; x++ ) { for ( y = 0 ; y < y_size ; y++ ) { // initialize existing clusters only if( LOCAL_CLUSTER->cluster_info[x][y] ) { for ( l = 0 ; l < levels ; l++ ) { xptr_t parent_xp; xptr_t child_xp[4]; uint32_t arity = 0; // get DQT node pointers xptr_t node_xp = hal_remote_l64( XPTR( ref_cxy, &barrier->node_xp[x][y][l] ) ); cxy_t node_cxy = GET_CXY( node_xp ); dqt_node_t * node_ptr = GET_PTR( node_xp ); // compute arity and child_xp[i] if (l == 0 ) // bottom DQT node { arity = nthreads; child_xp[0] = XPTR_NULL; child_xp[1] = XPTR_NULL; child_xp[2] = XPTR_NULL; child_xp[3] = XPTR_NULL; } else // not a bottom DQT node { arity = 0; // only few non-bottom nodes must be initialised if( ((x & ((1<node_xp[cx[i]][cy[i]][l-1] ) ); // increment arity arity++; } else { child_xp[i] = XPTR_NULL; } } } } // compute parent_xp if( l == (levels - 1) ) // root DQT node { parent_xp = XPTR_NULL; } else // not the root { uint32_t px = 0; // parent X coordinate uint32_t py = 0; // parent Y coordinate bool_t found = false; // compute macro_cluster x_min, x_max, y_min, y_max uint32_t x_min = x & ~((1<<(l+1))-1); uint32_t x_max = x_min + (1<<(l+1)); uint32_t y_min = y & ~((1<<(l+1))-1); uint32_t y_max = y_min + (1<<(l+1)); // scan all clusters in macro-cluster[x][y][l] / take first active for( px = x_min ; px < x_max ; px++ ) { for( py = y_min ; py < y_max ; py++ ) { if( LOCAL_CLUSTER->cluster_info[px][py] ) found = true; if( found ) break; } if( found ) break; } parent_xp = hal_remote_l64( XPTR( ref_cxy , &barrier->node_xp[px][py][l+1] ) ); } // initializes the DQT node hal_remote_s32( XPTR( node_cxy , &node_ptr->arity ) , arity ); hal_remote_s32( XPTR( node_cxy , &node_ptr->current ) , 0 ); hal_remote_s32( XPTR( node_cxy , &node_ptr->sense ) , 0 ); hal_remote_s32( XPTR( node_cxy , &node_ptr->level ) , l ); hal_remote_s64( XPTR( node_cxy , 
&node_ptr->parent_xp ) , parent_xp ); hal_remote_s64( XPTR( node_cxy , &node_ptr->child_xp[0] ) , child_xp[0] ); hal_remote_s64( XPTR( node_cxy , &node_ptr->child_xp[1] ) , child_xp[1] ); hal_remote_s64( XPTR( node_cxy , &node_ptr->child_xp[2] ) , child_xp[2] ); hal_remote_s64( XPTR( node_cxy , &node_ptr->child_xp[3] ) , child_xp[3] ); xlist_root_init( XPTR( node_cxy , &node_ptr->root ) ); remote_busylock_init( XPTR( node_cxy , &node_ptr->lock ), LOCK_BARRIER_STATE ); } } } } #if DEBUG_BARRIER_CREATE cycle = (uint32_t)hal_get_cycles(); if( cycle > DEBUG_BARRIER_CREATE ) printk("\n[%s] thread[%x,%x] completed DQT barrier initialisation / cycle %d\n", __FUNCTION__, process->pid, this->trdid, cycle ); dqt_barrier_display( barrier_xp ); #endif return barrier; } // end dqt_barrier_create() /////////////////////////////////////////////// void dqt_barrier_destroy( xptr_t barrier_xp ) { uint32_t x; uint32_t y; // get DQT barrier descriptor cluster and local pointer dqt_barrier_t * barrier_ptr = GET_PTR( barrier_xp ); cxy_t barrier_cxy = GET_CXY( barrier_xp ); #if DEBUG_BARRIER_DESTROY thread_t * this = CURRENT_THREAD; uint32_t cycle = (uint32_t)hal_get_cycles(); if( cycle > DEBUG_BARRIER_DESTROY ) printk("\n[%s] thread[%x,%x] enter for barrier (%x,%x) / cycle %d\n", __FUNCTION__, this->process->pid, this->trdid, barrier_cxy, barrier_ptr, cycle ); #endif // get x_size and y_size global parameters uint32_t x_size = hal_remote_l32( XPTR( barrier_cxy , &barrier_ptr->x_size ) ); uint32_t y_size = hal_remote_l32( XPTR( barrier_cxy , &barrier_ptr->y_size ) ); // 1. 
release memory allocated for the DQT nodes // in all clusters covered by the QDT mesh for ( x = 0 ; x < x_size ; x++ ) { for ( y = 0 ; y < y_size ; y++ ) { // compute target cluster identifier cxy_t cxy = HAL_CXY_FROM_XY( x , y ); // existing cluster only if( LOCAL_CLUSTER->cluster_info[x][y] ) { // get local pointer on dqt_nodes array in target cluster xptr_t buf_xp_xp = XPTR( barrier_cxy , &barrier_ptr->node_xp[x][y][0] ); xptr_t buf_xp = hal_remote_l64( buf_xp_xp ); void * buf = GET_PTR( buf_xp ); kmem_remote_free( cxy , buf , 9 ); // 512 bytes #if DEBUG_BARRIER_DESTROY thread_t * this = CURRENT_THREAD; uint32_t cycle = (uint32_t)hal_get_cycles(); if( cycle > DEBUG_BARRIER_DESTROY ) printk("\n[%s] thread[%x,%x] released node array %x in cluster %x / cycle %d\n", __FUNCTION__, this->process->pid, this->trdid, buf, cxy, cycle ); #endif } } } // 2. release memory allocated for barrier descriptor in ref cluster kmem_remote_free( barrier_cxy, barrier_ptr, CONFIG_PPM_PAGE_ORDER + 2 ); // 4 small pages #if DEBUG_BARRIER_DESTROY cycle = (uint32_t)hal_get_cycles(); if( cycle > DEBUG_BARRIER_DESTROY ) printk("\n[%s] thread[%x,%x] release barrier descriptor (%x,%x) / cycle %d\n", __FUNCTION__, this->process->pid, this->trdid, barrier_cxy, barrier_ptr, cycle ); #endif } // end dqt_barrier_destroy() //////////////////////////////////////////// void dqt_barrier_wait( xptr_t barrier_xp ) { thread_t * this = CURRENT_THREAD; // check calling thread can yield thread_assert_can_yield( this , __FUNCTION__ ); // get cluster and local pointer on DQT barrier descriptor dqt_barrier_t * barrier_ptr = GET_PTR( barrier_xp ); cxy_t barrier_cxy = GET_CXY( barrier_xp ); #if DEBUG_BARRIER_WAIT uint32_t cycle = (uint32_t)hal_get_cycles(); if( cycle > DEBUG_BARRIER_WAIT ) printk("\n[%s] thread[%x,%x] enter / barrier (%x,%x) / cycle %d\n", __FUNCTION__, this->process->pid, this->trdid, barrier_cxy, barrier_ptr, cycle ); #endif // get extended pointer on local bottom DQT node uint32_t x = 
HAL_X_FROM_CXY( local_cxy ); uint32_t y = HAL_Y_FROM_CXY( local_cxy ); xptr_t node_xp = hal_remote_l64( XPTR( barrier_cxy , &barrier_ptr->node_xp[x][y][0] ) ); // call recursive function to traverse DQT from bottom to root dqt_barrier_increment( node_xp ); #if DEBUG_BARRIER_WAIT cycle = (uint32_t)hal_get_cycles(); if( cycle > DEBUG_BARRIER_WAIT ) printk("\n[%s] thread[%x,%x] exit / barrier (%x,%x) / cycle %d\n", __FUNCTION__, this->trdid, this->process->pid, barrier_cxy, barrier_ptr, cycle ); #endif } // end dqt_barrier_wait() ////////////////////////////////////////////// void dqt_barrier_display( xptr_t barrier_xp ) { // get cluster and local pointer on DQT barrier dqt_barrier_t * barrier_ptr = GET_PTR( barrier_xp ); cxy_t barrier_cxy = GET_CXY( barrier_xp ); // get barrier global parameters uint32_t x_size = hal_remote_l32( XPTR( barrier_cxy , &barrier_ptr->x_size ) ); uint32_t y_size = hal_remote_l32( XPTR( barrier_cxy , &barrier_ptr->y_size ) ); uint32_t nthreads = hal_remote_l32( XPTR( barrier_cxy , &barrier_ptr->nthreads ) ); // compute size and number of DQT levels uint32_t z = (x_size > y_size) ? x_size : y_size; uint32_t levels = (z < 2) ? 1 : (z < 3) ? 2 : (z < 5) ? 3 : (z < 9) ? 
4 : 5; printk("\n***** DQT barrier : x_size %d / y_size %d / nthreads %d / levels %d *****\n", x_size, y_size, nthreads, levels ); uint32_t x , y , l; for ( x = 0 ; x < x_size ; x++ ) { for ( y = 0 ; y < y_size ; y++ ) { printk(" - cluster[%d,%d]\n", x , y ); for ( l = 0 ; l < levels ; l++ ) { // get pointers on target node xptr_t node_xp = hal_remote_l64( XPTR( barrier_cxy , &barrier_ptr->node_xp[x][y][l] ) ); dqt_node_t * node_ptr = GET_PTR( node_xp ); cxy_t node_cxy = GET_CXY( node_xp ); if( node_xp != XPTR_NULL ) { uint32_t level = hal_remote_l32( XPTR( node_cxy , &node_ptr->level )); xptr_t pa_xp = hal_remote_l32( XPTR( node_cxy , &node_ptr->parent_xp )); xptr_t c0_xp = hal_remote_l32( XPTR( node_cxy , &node_ptr->child_xp[0] )); xptr_t c1_xp = hal_remote_l32( XPTR( node_cxy , &node_ptr->child_xp[1] )); xptr_t c2_xp = hal_remote_l32( XPTR( node_cxy , &node_ptr->child_xp[2] )); xptr_t c3_xp = hal_remote_l32( XPTR( node_cxy , &node_ptr->child_xp[3] )); printk(" . level %d : (%x,%x) / P(%x,%x) / C0(%x,%x)" " C1(%x,%x) / C2(%x,%x) / C3(%x,%x)\n", level, node_cxy, node_ptr, GET_CXY(pa_xp), GET_PTR(pa_xp), GET_CXY(c0_xp), GET_PTR(c0_xp), GET_CXY(c1_xp), GET_PTR(c1_xp), GET_CXY(c2_xp), GET_PTR(c2_xp), GET_CXY(c3_xp), GET_PTR(c3_xp) ); } } } } } // end dqt_barrier_display() ////////////////////////////////////////////////////////////////////////////////////////// // This static (recursive) function is called by the dqt_barrier_wait() function. // It traverses the DQT from bottom to root, and decrements the "current" variables. // For each traversed node, it blocks and deschedules if it is not the last expected // thread. The last arrived thread reset the local node before returning. 
////////////////////////////////////////////////////////////////////////////////////////// static void dqt_barrier_increment( xptr_t node_xp ) { uint32_t expected; uint32_t sense; uint32_t arity; thread_t * this = CURRENT_THREAD; // get node cluster and local pointer dqt_node_t * node_ptr = GET_PTR( node_xp ); cxy_t node_cxy = GET_CXY( node_xp ); // build relevant extended pointers xptr_t arity_xp = XPTR( node_cxy , &node_ptr->arity ); xptr_t sense_xp = XPTR( node_cxy , &node_ptr->sense ); xptr_t current_xp = XPTR( node_cxy , &node_ptr->current ); xptr_t lock_xp = XPTR( node_cxy , &node_ptr->lock ); xptr_t root_xp = XPTR( node_cxy , &node_ptr->root ); #if DEBUG_BARRIER_WAIT uint32_t cycle = (uint32_t)hal_get_cycles(); uint32_t level = hal_remote_l32( XPTR( node_cxy, &node_ptr->level ) ); if( cycle > DEBUG_BARRIER_WAIT ) printk("\n[%s] thread[%x,%x] increments DQT node(%d,%d,%d) / cycle %d\n", __FUNCTION__ , this->process->pid, this->trdid, HAL_X_FROM_CXY(node_cxy), HAL_Y_FROM_CXY(node_cxy), level ); #endif // get extended pointer on parent node xptr_t parent_xp = hal_remote_l64( XPTR( node_cxy , &node_ptr->parent_xp ) ); // take busylock remote_busylock_acquire( lock_xp ); // get sense and arity values from barrier descriptor sense = hal_remote_l32( sense_xp ); arity = hal_remote_l32( arity_xp ); // compute expected value expected = (sense == 0) ? 1 : 0; // increment current number of arrived threads / get value before increment uint32_t current = hal_remote_atomic_add( current_xp , 1 ); // last arrived thread reset the local node, makes the recursive call // on parent node, and reactivates all waiting thread when returning. // other threads block, register in queue, and deschedule. 
if ( current == (arity - 1) ) // last thread { #if DEBUG_BARRIER_WAIT if( cycle > DEBUG_BARRIER_WAIT ) printk("\n[%s] thread[%x,%x] reset DQT node(%d,%d,%d)\n", __FUNCTION__ , this->process->pid, this->trdid, HAL_X_FROM_CXY(node_cxy), HAL_Y_FROM_CXY(node_cxy), level ); #endif // reset the current node hal_remote_s32( sense_xp , expected ); hal_remote_s32( current_xp , 0 ); // release busylock protecting the current node remote_busylock_release( lock_xp ); // recursive call on parent node when current node is not the root if( parent_xp != XPTR_NULL) dqt_barrier_increment( parent_xp ); // unblock all waiting threads on this node while( xlist_is_empty( root_xp ) == false ) { // get pointers on first waiting thread xptr_t thread_xp = XLIST_FIRST( root_xp , thread_t , wait_list ); cxy_t thread_cxy = GET_CXY( thread_xp ); thread_t * thread_ptr = GET_PTR( thread_xp ); #if (DEBUG_BARRIER_WAIT & 1) trdid_t trdid = hal_remote_l32( XPTR( thread_cxy , &thread_ptr->trdid ) ); process_t * process = hal_remote_lpt( XPTR( thread_cxy , &thread_ptr->process ) ); pid_t pid = hal_remote_l32( XPTR( thread_cxy , &process->pid ) ); if( cycle > DEBUG_BARRIER_WAIT ) printk("\n[%s] thread[%x,%x] unblock thread[%x,%x]\n", __FUNCTION__, this->process->pid, this->trdid, pid, trdid ); #endif // remove waiting thread from queue xlist_unlink( XPTR( thread_cxy , &thread_ptr->wait_list ) ); // unblock waiting thread thread_unblock( thread_xp , THREAD_BLOCKED_USERSYNC ); } } else // not the last thread { // get extended pointer on xlist entry from thread xptr_t entry_xp = XPTR( local_cxy , &this->wait_list ); // register calling thread in barrier waiting queue xlist_add_last( root_xp , entry_xp ); // block calling thread thread_block( XPTR( local_cxy , this ) , THREAD_BLOCKED_USERSYNC ); // release busylock protecting the remote_barrier remote_busylock_release( lock_xp ); #if DEBUG_BARRIER_WAIT if( cycle > DEBUG_BARRIER_WAIT ) printk("\n[%s] thread[%x,%x] blocks on node(%d,%d,%d)\n", __FUNCTION__ , 
this->process->pid, this->trdid, HAL_X_FROM_CXY(node_cxy), HAL_Y_FROM_CXY(node_cxy), level ); #endif // deschedule sched_yield("blocked on barrier"); } return; } // end dqt_barrier_decrement()