/////////////////////////////////////////////////////////////////////////////// // File : sort.c // Date : November 2013 // Author : Cesar Fuguet Tortolero /////////////////////////////////////////////////////////////////////////////// // This multi-threaded application implement a multi-stage sort application. // The various stages are separated by synchronisation barriers. // There is one thread per physical cores. // Computation is organised as a binary tree: // - All threads execute in parallel a buble sort on a sub-array during the // the first stage of parallel sort, // - The number of participating threads is divided by 2 at each next stage, // to make a merge sort, on two subsets of previous stage. // // Number_of_stages = number of barriers = log2(Number_of_threads) // // Constraints : // - It supports up to 1024 cores: x_size, y_size, and ncores must be // power of 2 (max 16*16 clusters / max 4 cores per cluster) // _ The array of values to be sorted (ARRAY_LENGTH) must be power of 2 // larger than the number of cores. /////////////////////////////////////////////////////////////////////////////// #include #include #include #include #include #define ARRAY_LENGTH 0x100 // 256 values #define VERBOSE 0 /////////////////////////////////////////////////////// // macros for fixed format cxy <=> (x,y) translation /////////////////////////////////////////////////////// #define CXY_FROM_XY( x , y ) ((x<<4) + y) #define X_FROM_CXY( cxy ) ((cxy>>4) & 0xF) #define Y_FROM_CXY( cxy ) (cxy & 0xF) ///////////////////////////////////////////////////////////// // argument for the sort() function (one thread per core) ///////////////////////////////////////////////////////////// typedef struct { unsigned int threads; // total number of threads unsigned int thread_uid; // thread user index (0 to threads -1) unsigned int main_uid; // main thread user index } args_t; ////////////////////////////////////////// // Global variables ////////////////////////////////////////// int array0[ARRAY_LENGTH]; // values to sort int array1[ARRAY_LENGTH]; pthread_barrier_t barrier; // synchronisation variables //////////////////////////////////// void bubbleSort( int * array, unsigned int length, unsigned int init_pos ) { int i; int j; int aux; for(i = 0; i < length; i++) { for(j = init_pos; j < (init_pos + length - i - 1); j++) { if(array[j] > array[j + 1]) { aux = array[j + 1]; array[j + 1] = array[j]; array[j] = aux; } } } } // end bubbleSort() ///////////////////////// void merge( int * src, int * dst, int length, int init_pos_src_a, int init_pos_src_b, int init_pos_dst ) { int i; int j; int k; i = 0; j = 0; k = init_pos_dst; while((i < length) || (j < length)) { if((i < length) && (j < length)) { if(src[init_pos_src_a + i] < src[init_pos_src_b + j]) { dst[k++] = src[init_pos_src_a + i]; i++; } else { dst[k++] = src[init_pos_src_b + j]; j++; } } else if(i < length) { dst[k++] = src[init_pos_src_a + i]; i++; } else { dst[k++] = src[init_pos_src_b + j]; j++; } } } // end merge() ///////////////////////// void sort( args_t * ptr ) { unsigned int i; unsigned long long cycle; int * src_array = NULL; int * dst_array = NULL; unsigned int thread_uid = ptr->thread_uid; unsigned int threads = ptr->threads; unsigned int main_uid = ptr->main_uid; unsigned int items = ARRAY_LENGTH / threads; unsigned int stages = __builtin_ctz( threads ) + 1; get_cycle( &cycle ); printf("\n[SORT] thread[%d] enter at cycle %d\n", thread_uid , (unsigned int)cycle ); printf("\n[SORT] thread[%d] / stage 0 start\n", thread_uid ); bubbleSort( array0, items, items * thread_uid ); printf("\n[SORT] thread[%d] / stage 0 completed\n", thread_uid ); ///////////////////////////////// pthread_barrier_wait( &barrier ); // the number of threads contributing to sort // is divided by 2 at each next stage for ( i = 1 ; i < stages ; i++ ) { pthread_barrier_wait( &barrier ); if( (thread_uid & ((1< res_array[n+1] ) { success = 0; break; } } #if VERBOSE printf("\n*** array after sort\n"); for( n=0; n