Changeset 637


Ignore:
Timestamp:
Jul 18, 2019, 2:06:55 PM (4 months ago)
Author:
alain
Message:

Introduce the non-standard pthread_parallel_create() system call
and re-write the <fft> and <sort> applications to improve the
intrinsic parallelism in applications.

Location:
trunk
Files:
69 edited

Legend:

Unmodified
Added
Removed
  • trunk/boot/tsar_mips32/boot.c

    r624 r637  
    954954#if DEBUG_BOOT_MULTI
    955955boot_printf("\n[BOOT] core[%x,%d] jump to kernel_init = %x at cycle %d\n",
    956 cxy, lid, __FUNCTION__, kernel_entry, boot_get_proctime() );
     956cxy, lid, kernel_entry, boot_get_proctime() );
    957957#endif
    958958
  • trunk/hal/generic/hal_uspace.h

    r626 r637  
    3131//
    3232// When moving data between user space and kernel space, the user address is always
    33 // a virtual address, but the kernel address can be a physical address, on 32 bits
    34 // architectures, and require MMU dynamic activation/deactivation.
     33// a virtual address, but the kernel address is an extended pointer.
    3534// For sake of portability, user/kernel data transfers must use the following API.
    3635//////////////////////////////////////////////////////////////////////////////////////////
     
    4140 * that can be located in any cluster.
    4241 *****************************************************************************************
    43  * @ k_cxy     : cluster identifier for kernel destination buffer.
    44  * @ k_dst     : local pointer on kernel destination buffer.
    45  * @ u_src     : source buffer address in user space.
     42 * @ k_dst_xp  : extended pointer on kernel destination buffer.
     43 * @ u_src_ptr : source buffer address in user space.
    4644 * @ size      : size (number of bytes).
    4745 ****************************************************************************************/
    48 extern void hal_copy_from_uspace( cxy_t      k_cxy,
    49                                   void     * k_dst,
    50                                   void     * u_src,
     46extern void hal_copy_from_uspace( xptr_t     k_dst_xp,
     47                                  void     * u_src_ptr,
    5148                                  uint32_t   size );
    5249
     
    5552 * to a data buffer in the user space.
    5653 *****************************************************************************************
    57  * @ k_cxy     : cluster identifier for kernel source buffer.
    58  * @ k_src     : local pointer on kernel source buffer.
    59  * @ u_dst     : destination buffer address in user space.
     54 * @ u_dst_ptr : destination buffer address in user space.
     55 * @ k_src_xp  : extended pointer on kernel source buffer.
    6056 * @ size      : size (number of bytes).
    6157 ****************************************************************************************/
    62 extern void hal_copy_to_uspace( cxy_t      k_cxy,
    63                                 void     * k_src,
    64                                 void     * u_dst,
     58extern void hal_copy_to_uspace( void     * u_dst_ptr,
     59                                xptr_t     k_src_xp,
    6560                                uint32_t   size );
    6661
     
    6964 * The transfer stops after the first encountered NUL character, and no more than
    7065 * <max_size> characters are actually copied to target buffer.
    71  * If the kernel uses physical addresses, it activates the MMU to access the user buffer.
    7266 *****************************************************************************************
    73  * @ u_dst     : destination buffer address in user space.
    74  * @ k_src     : source address in kernel space.
     67 * @ k_dst_xp  : extended pointer on kernel destination buffer.
     68 * @ u_src_ptr : source address in user space.
    7569 * @ max_size  : max number of characters to be copied.
    7670 ****************************************************************************************/
    77 extern void hal_strcpy_from_uspace( char     * k_dst,
    78                                     char     * u_src,
     71extern void hal_strcpy_from_uspace( xptr_t     k_dst_xp,
     72                                    char     * u_src_ptr,
    7973                                    uint32_t   max_size );
    8074
     
    8377 * The transfer stops after the first encountered NUL character, and no more than
    8478 * <max_size> characters are actually copied to target buffer.
    85  * If the kernel uses physical addresses, it activates the MMU to access the user buffer.
    8679 *****************************************************************************************
    87  * @ u_dst    : destination buffer address in user space.
    88  * @ k_src     : source address in kernel space.
     80 * @ u_dst_ptr : destination buffer address in user space.
     81 * @ k_src_xp  : extended pointer on kernel source buffer.
    8982 * @ max_size  : max number of characters to be copied.
    9083 ****************************************************************************************/
    91 extern void hal_strcpy_to_uspace( char     * u_dst,
    92                                   char     * k_src,
     84extern void hal_strcpy_to_uspace( char     * u_dst_ptr,
     85                                  xptr_t     k_src_xp,
    9386                                  uint32_t   max_size );
    9487
    9588/*****************************************************************************************
    9689 * This function computes the length of a string in user space.
    97  * If the kernel uses physical addresses, it activates the MMU to access the user buffer.
    9890 *****************************************************************************************
    9991 * @ string     : string in user space.
  • trunk/hal/tsar_mips32/core/hal_gpt.c

    r635 r637  
    133133///////////////////////////////////////////////////////////////////////////////////////
    134134
    135 #define GPT_LOCK_WATCHDOG   100000
     135#define GPT_LOCK_WATCHDOG   1000000
    136136
    137137/////////////////////////////////////
  • trunk/hal/tsar_mips32/core/hal_uspace.c

    r626 r637  
    3232///////////////////////////////////////////////////////////////////////////////////////
    3333// This function moves <size> bytes from a source buffer in user virtual space,
    34 // defined by the <u_src> argument, to a destination kernel buffer, defined by the
    35 // <k_cxy> and <k_dst> arguments.
    36 // It works in a critical section, as it modifies briefly two CP2 registers:
     34// defined by the <u_src_ptr> argument, to a destination kernel buffer, defined by the
     35// <k_dst_xp> argument.
     36// It works in a critical section, as it modifies two CP2 registers:
    3737// It activates briefly the DATA_MMU by writing into the CP2_MODE register to access the
    3838// user buffer, and modifies the CP2_DATA_EXT register to access the kernel buffer.
     
    4141// If the buffers are not aligned, it moves all data byte per byte.
    4242///////////////////////////////////////////////////////////////////////////////////////
    43 // @ k_cxy    : cluster of destination kernel buffer
    44 // @ k_dst    : pointer on destination kernel buffer
    45 // @ u_src    : pointer on source user buffer
     43// @ k_dst_xp  : extended pointer on destination kernel buffer
     44// @ u_src_ptr : pointer on source user buffer
    4645// @ size     : number of bytes to move
    4746///////////////////////////////////////////////////////////////////////////////////////
    48 void hal_copy_from_uspace( cxy_t      k_cxy,
    49                            void     * k_dst,
    50                            void     * u_src,
     47void hal_copy_from_uspace( xptr_t     k_dst_xp,
     48                           void     * u_src_ptr,
    5149                           uint32_t   size ) 
    5250{
    5351    uint32_t save_sr;
    54         uint32_t words;                        // number of words (if buffers aligned)
    55     uint32_t src = (uint32_t)u_src;
    56     uint32_t dst = (uint32_t)k_dst;
     52        uint32_t words;                            // number of words (if buffers aligned)
     53    uint32_t src = (uint32_t)u_src_ptr;
     54    uint32_t dst = (uint32_t)GET_PTR( k_dst_xp );
     55    uint32_t cxy = (uint32_t)GET_CXY( k_dst_xp );
     56   
    5757 
    5858#if DEBUG_HAL_USPACE
     
    6161if( cycle > DEBUG_HAL_USPACE )
    6262printk("\n[%s] thread[%x,%x] enter / %d bytes / u_buf(%x,%x) -> k_buf(%x,%x) / cycle %d\n",
    63 __FUNCTION__, this->process->pid, this->trdid, size, local_cxy, u_src, k_cxy, k_dst, cycle );
     63__FUNCTION__, this->process->pid, this->trdid, size, local_cxy, src, cxy, dst, cycle );
    6464#endif
    6565
     
    8080                  "ori    $13,   $12,   0x4       \n"   /* $13 <= MMU_MODE with DTLB  */
    8181
    82                   /* save old MMU_DATA_EXT and set k_cxy in it                    */
     82                  /* save old MMU_DATA_EXT and set cxy in it                      */
    8383                  "mfc2   $16,   $24          \n"   /* $16 <= old MMU_DATA_EXT    */
    84                   "mtc2   %4,    $24          \n"   /* MMU_DATA_EXT <= k_cxy      */
     84                  "mtc2   %4,    $24          \n"   /* MMU_DATA_EXT <= cxy        */
    8585
    8686                  /* transfer one word per iteration in first loop if aligned     */
     
    118118                  ".set reorder               \n"
    119119                  :
    120                   : "r"(src) , "r"(dst) , "r"(words) , "r"(size) , "r"(k_cxy)
     120                  : "r"(src) , "r"(dst) , "r"(words) , "r"(size) , "r"(cxy)
    121121                  : "$8","$9","$10","$11","$12","$13","$14","$15","$16","memory" );
    122122
     
    128128if( cycle > DEBUG_HAL_USPACE )
    129129printk("\n[%s] thread[%x,%x] moved %d bytes / u_buf(%x,%x) -> k_buf(%x,%x) / cycle %d\n",
    130 __FUNCTION__, this->process->pid, this->trdid, size, local_cxy, u_src, k_cxy, k_dst, cycle );
     130__FUNCTION__, this->process->pid, this->trdid, size, local_cxy, src, cxy, dst, cycle );
    131131#endif
    132132
     
    135135///////////////////////////////////////////////////////////////////////////////////////
    136136// This function moves <size> bytes from a source kernel buffer, defined by the
    137 // <k_cxy> and <k_src> arguments, to a destination buffer in user virtual space,
    138 // defined by the <u_dst> argument.
    139 // It works in a critical section, as it modifies briefly two CP2 registers:
     137// <k_src_xp> argument, to a destination buffer in user virtual space, defined by
     138// the <u_dst_ptr> argument.
     139// It works in a critical section, as it modifies two CP2 registers:
    140140// It activates briefly the DATA_MMU by writing into the CP2_MODE register to access the
    141141// user buffer, and modifies the CP2_DATA_EXT register to access the kernel buffer.
     
    144144// If the buffers are not aligned, it moves all data byte per byte.
    145145///////////////////////////////////////////////////////////////////////////////////////
    146 // @ k_cxy    : cluster of destination kernel buffer
    147 // @ k_dst    : pointer on destination kernel buffer
    148 // @ u_src    : pointer on source user buffer
    149 // @ size     : number of bytes to move
    150 ///////////////////////////////////////////////////////////////////////////////////////
    151 void hal_copy_to_uspace( cxy_t      k_cxy,
    152                          void     * k_src,
    153                          void     * u_dst,
     146// @ u_dst_ptr : pointer on destination user buffer
     147// @ k_src_xp  : extended pointer on source kernel buffer
     148// @ size      : number of bytes to move
     149///////////////////////////////////////////////////////////////////////////////////////
     150void hal_copy_to_uspace( void     * u_dst_ptr,
     151                         xptr_t     k_src_xp,
    154152                         uint32_t   size )
    155153{
    156154    uint32_t save_sr;
    157         uint32_t words;                   // number of words (if buffers aligned)
    158     uint32_t src = (uint32_t)k_src;
    159     uint32_t dst = (uint32_t)u_dst;
     155        uint32_t words;                           // number of words (if buffers aligned)
     156    uint32_t dst = (uint32_t)u_dst_ptr;
     157    uint32_t src = (uint32_t)GET_PTR( k_src_xp );
     158    uint32_t cxy = (uint32_t)GET_CXY( k_src_xp );
    160159
    161160#if DEBUG_HAL_USPACE
     
    164163if( cycle > DEBUG_HAL_USPACE )
    165164printk("\n[%s] thread[%x,%x] enter / %d bytes / k_buf(%x,%x) -> u_buf(%x,%x) / cycle %d\n",
    166 __FUNCTION__, this->process->pid, this->trdid, size, k_cxy, k_src, local_cxy, u_dst, cycle );
     165__FUNCTION__, this->process->pid, this->trdid, size, cxy, src, local_cxy, dst, cycle );
    167166#endif
    168167
     
    183182                  "ori    $13,   $12,   0x4       \n"   /* $13 <= MMU_MODE with DTLB  */
    184183
    185                   /* save old MMU_DATA_EXT and set k_cxy in it                    */
     184                  /* save old MMU_DATA_EXT and set cxy in it                      */
    186185                  "mfc2   $16,   $24          \n"   /* $16 <= old MMU_DATA_EXT    */
    187                   "mtc2   %4,    $24          \n"   /* MMU_DATA_EXT <= k_cxy      */
     186                  "mtc2   %4,    $24          \n"   /* MMU_DATA_EXT <= cxy        */
    188187
    189188                  /* transfer one word per iteration in first loop if aligned     */
     
    221220                  ".set reorder               \n"
    222221                  :
    223                   : "r"(src) , "r"(dst) , "r"(words) , "r"(size) , "r"(k_cxy)
     222                  : "r"(src) , "r"(dst) , "r"(words) , "r"(size) , "r"(cxy)
    224223                  : "$8","$9","$10","$11","$12","$13","$14","$15","$16","memory" );
    225224
     
    231230if( cycle > DEBUG_HAL_USPACE )
    232231printk("\n[%s] thread[%x,%x] moved %d bytes / k_buf(%x,%x) -> u_buf(%x,%x) / cycle %d\n",
    233 __FUNCTION__, this->process->pid, this->trdid, size, k_cxy, k_src, local_cxy, u_dst, cycle );
     232__FUNCTION__, this->process->pid, this->trdid, size, cxy, src, local_cxy, dst, cycle );
    234233#endif
    235234
    236235}  // end hal_copy_to_uspace()
    237236
    238 //////////////////////////////////////////////
    239 void hal_strcpy_from_uspace( char     * k_dst,
    240                              char     * u_src,
     237/////////////////////////////////////////////////
     238void hal_strcpy_from_uspace( xptr_t     k_dst_xp,
     239                             char     * u_src_ptr,
    241240                             uint32_t   size )
    242241{
    243242    uint32_t save_sr;
    244     uint32_t src = (uint32_t)u_src;
    245     uint32_t dst = (uint32_t)k_dst;
     243    uint32_t src = (uint32_t)u_src_ptr;
     244    uint32_t dst = (uint32_t)GET_PTR( k_dst_xp );
     245    uint32_t cxy = (uint32_t)GET_CXY( k_dst_xp );
    246246
    247247    hal_disable_irq( &save_sr );
    248248
    249249    // loop on characters while ( (character != NUL) and (count < size ) )
     250
    250251    asm volatile(
    251252        ".set noreorder             \n"
     253
     254        /* save old MMU_DATA_EXT and set cxy in it                          */
     255        "mfc2   $16,   $24          \n"   /* $16 <= old MMU_DATA_EXT        */
     256        "mtc2   %3,    $24          \n"   /* MMU_DATA_EXT <= cxy            */
     257
    252258        "move   $11,   %0           \n"   /* $11 <= count == size           */
    253259        "move   $12,   %1           \n"   /* $12 <= u_src                   */
    254260        "move   $13,   %2           \n"   /* $13 <= k_dst                   */
    255         "mfc2   $15,   $1           \n"   /* $15 <= mode DTLB and ITLB off  */
    256         "ori    $14,   $15,  0x4    \n"   /* $14 <= mode DTLB on            */
     261        "mfc2   $15,   $1           \n"   /* $15 <= MMU_MODE                */
     262        "ori    $14,   $15,  0x4    \n"   /* $14 <= MMU_MODE / DTLB ON      */
     263
    257264        "1:                         \n"
    258265        "mtc2   $14,   $1                       \n"   /* MMU_MODE <= DTLB ON            */
    259266        "lb     $10,   0($12)       \n"   /* read char from user space      */
    260         "mtc2   $15,   $1                       \n"   /* restore DTLB and ITLB off      */
     267        "mtc2   $15,   $1                       \n"   /* MMU_MODE <= DTLB OFF           */
    261268            "sb     $10,   0($13)       \n"   /* store char to kernel space     */
    262269        "beq    $10,   $0,   2f     \n"   /* exit if char = 0               */
     
    268275        "2:                         \n"
    269276        "nop                        \n"
     277
     278        /* restore old MMU_DATA_EXT register                                */
     279        "mtc2   $16,   $24          \n"   /* MMU_DATA_EXT <= $16            */
     280
    270281        ".set reorder               \n"
    271282        :
    272         : "r"(size),"r"(src),"r"(dst)
    273         : "$10","$11","$12","$13","$14","$15" );
     283        : "r"(size) , "r"(src) , "r"(dst) , "r"(cxy)
     284        : "$10","$11","$12","$13","$14","$15","$16" );
    274285       
    275286    hal_restore_irq( save_sr );
     
    277288} // hal_strcpy_from_uspace()
    278289
    279 ////////////////////////////////////////////
    280 void hal_strcpy_to_uspace( char     * u_dst,
    281                            char     * k_src,
     290////////////////////////////////////////////////
     291void hal_strcpy_to_uspace( char     * u_dst_ptr,
     292                           xptr_t     k_src_xp,
    282293                           uint32_t   size )
    283294{
    284295    uint32_t save_sr;
    285     uint32_t src = (uint32_t)k_src;
    286     uint32_t dst = (uint32_t)u_dst;
     296    uint32_t dst = (uint32_t)u_dst_ptr;
     297    uint32_t src = (uint32_t)GET_PTR( k_src_xp );
     298    uint32_t cxy = (uint32_t)GET_CXY( k_src_xp );
    287299
    288300    hal_disable_irq( &save_sr );
    289301
    290302    // loop on characters while ( (character != NUL) and (count < size) )
     303
    291304    asm volatile(
    292305        ".set noreorder             \n"
     306
     307        /* save old MMU_DATA_EXT and set cxy in it                          */
     308        "mfc2   $16,   $24          \n"   /* $16 <= old MMU_DATA_EXT        */
     309        "mtc2   %3,    $24          \n"   /* MMU_DATA_EXT <= cxy            */
     310
    293311        "move   $11,   %0           \n"   /* $11 <= count == size           */
    294312        "move   $12,   %1           \n"   /* $12 <= k_src                   */
    295313        "move   $13,   %2           \n"   /* $13 <= u_dst                   */
    296         "mfc2   $15,   $1           \n"   /* $15 <= mode DTLB and ITLB off  */
    297         "ori    $14,   $15,  0x4    \n"   /* $14 <= mode DTLB on            */
     314        "mfc2   $15,   $1           \n"   /* $15 <= MMU_MODE                */
     315        "ori    $14,   $15,  0x4    \n"   /* $14 <= MMU_MODE modified       */
     316
    298317        "1:                         \n"
    299318        "lb     $10,   0($12)       \n"   /* read char from kernel space    */
    300319        "mtc2   $14,   $1                       \n"   /* MMU_MODE <= DTLB ON            */
    301320            "sb     $10,   0($13)       \n"   /* store char to user space       */
    302         "mtc2   $15,   $1                       \n"   /* restore DTLB and ITLB off      */
     321        "mtc2   $15,   $1                       \n"   /* MMU_MODE <= DTLB OFF           */
    303322        "beq    $10,   $0,   2f     \n"   /* exit if char == 0              */
    304323        "addi   $11,   $11, -1      \n"   /* decrement count                */
    305324        "addi   $12,   $12,  1      \n"   /* increment k_src pointer        */
    306         "beq    $11,   $0,   2f     \n"   /* exit if count == size          */
     325        "beq    $11,   $0,   2f     \n"   /* exit if count == 0             */
    307326        "addi   $13,   $13,  1      \n"   /* increment u_src pointer        */
    308327        "j                   1b     \n"   /* jump to next iteration         */
    309328        "2:                         \n"
    310329        "nop                        \n"
     330
     331        /* restore old MMU_DATA_EXT register                                */
     332        "mtc2   $16,   $24          \n"   /* MMU_DATA_EXT <= $16            */
     333
    311334        ".set reorder               \n"
    312335        :
    313         : "r"(size),"r"(src),"r"(dst)
    314         : "$10","$11","$12","$13","$14","$15" );
     336        : "r"(size) , "r"(src) , "r"(dst) , "r"(cxy)
     337        : "$10","$11","$12","$13","$14","$15","$16" );
    315338       
    316339    hal_restore_irq( save_sr );
  • trunk/hal/tsar_mips32/core/hal_vmm.c

    r635 r637  
    111111printk("\n[%s] thread[%x,%x] registered kcode vseg[%x,%x] in cluster %x\n",
    112112__FUNCTION__, this->process->pid, this->trdid, info->kcode_base, info->kcode_size, local_cxy );
    113 hal_vmm_display( &process_zero , true );
     113hal_vmm_display( XPTR( local_cxy, &process_zero ) , true );
    114114#endif
    115115
     
    136136printk("\n[%s] thread[%x,%x] enter in cluster %x \n",
    137137__FUNCTION__, this->process->pid, this->trdid, cxy );
    138 hal_vmm_display( &process_zero , true );
    139 hal_vmm_display( process , true );
     138hal_vmm_display( XPTR( local_cxy , process ) , true );
    140139#endif
    141140
     
    190189__FUNCTION__, this->process->pid, this->trdid,
    191190vseg_type_str(vseg->type) , vseg->min, (vseg->max - vseg->min) );
    192 hal_vmm_display( process , true );
     191hal_vmm_display( XPTR( local_cxy , process ) , true );
    193192#endif
    194193
  • trunk/kernel/Makefile

    r633 r637  
    175175
    176176SYS_OBJS_4  = build/syscalls/sys_get_config.o      \
    177               build/syscalls/sys_get_core.o        \
     177              build/syscalls/sys_get_core_id.o     \
    178178              build/syscalls/sys_get_cycle.o       \
    179179              build/syscalls/sys_display.o         \
     
    187187SYS_OBJS_5  = build/syscalls/sys_exit.o            \
    188188              build/syscalls/sys_sync.o            \
    189               build/syscalls/sys_fsync.o
     189              build/syscalls/sys_fsync.o           \
     190              build/syscalls/sys_get_best_core.o   \
     191              build/syscalls/sys_get_nb_cores.o
    190192
    191193VFS_OBJS    = build/fs/vfs.o              \
  • trunk/kernel/devices/dev_dma.c

    r619 r637  
    22 * dev_dma.c - DMA (Interrupt Controler Unit) generic device API implementation.
    33 *
    4  * Authors   Alain Greiner  (2016,2017,2018)
     4 * Authors   Alain Greiner  (2016,2017,2018,2019)
    55 *
    66 * Copyright (c) UPMC Sorbonne Universites
     
    6161    error_t    error;
    6262
     63    lid_t lid = cluster_select_local_core( local_cxy );
     64
    6365    error = thread_kernel_create( &new_thread,
    6466                                  THREAD_DEV,
    6567                                  &chdev_server_func,
    6668                                  dma,
    67                                   cluster_select_local_core() );
     69                                  lid );
    6870    if( error )
    6971    {
  • trunk/kernel/devices/dev_ioc.c

    r626 r637  
    6767
    6868    // select a core to execute the IOC server thread
    69     lid_t lid = cluster_select_local_core();
     69    lid_t lid = cluster_select_local_core( local_cxy );
    7070
    7171    // bind the IOC IRQ to the selected core
  • trunk/kernel/devices/dev_nic.c

    r619 r637  
    22 * dev_nic.c - NIC (Network Controler) generic device API implementation.
    33 *
    4  * Author  Alain Greiner    (2016,2017,2018)
     4 * Author  Alain Greiner    (2016,2017,2018,2019)
    55 *
    66 * Copyright (c) UPMC Sorbonne Universites
     
    5858
    5959    // select a core to execute the NIC server thread
    60     lid_t lid = cluster_select_local_core();
     60    lid_t lid = cluster_select_local_core( local_cxy );
    6161
    6262    // bind the NIC IRQ to the selected core
  • trunk/kernel/devices/dev_txt.c

    r626 r637  
    9595    {
    9696        // select a core to execute the server thread
    97         lid_t lid = cluster_select_local_core();
     97        lid_t lid = cluster_select_local_core( local_cxy );
    9898
    9999        // The unique IRQ from cluster 00's MTTY must be bound to a RX chdev
     
    131131        thread_unblock( XPTR( local_cxy , new_thread ) , THREAD_BLOCKED_GLOBAL );
    132132    }
    133 }
     133}  // end dev_txt_init()
    134134
    135135//////////////////////////////////////////////////////////////////////////////////
     
    166166    // return I/O operation status from calling thread descriptor
    167167    return this->txt_cmd.error;
    168 }
     168
     169}  // end dev_txt_access()
    169170
    170171/////////////////////////////////////////
     
    173174                       uint32_t   count )
    174175{
     176    error_t error;
    175177
    176178#if (DEBUG_SYS_WRITE & 1)
     
    182184uint32_t   cycle = (uint32_t)hal_get_cycles();
    183185if( DEBUG_DEV_TXT_TX < cycle )
    184 printk("\n[%s] thread[%x,%x] enters / cycle %d\n",
    185 __FUNCTION__, this->process->pid, this->trdid, cycle );
    186 #endif
    187 
    188     // get extended pointer on TXT[0] chdev
     186printk("\n[%s] thread[%x,%x] enters for <%s> / cycle %d\n",
     187__FUNCTION__, this->process->pid, this->trdid, buffer, cycle );
     188#endif
     189
     190    // If we use MTTY (vci_multi_tty), we do a synchronous write on TXT[0]
     191    // If we use TTY  (vci_tty_tsar), we do a standard asynchronous write
     192    // TODO this is not very clean ... [AG]
     193
     194    // get pointers on chdev
    189195    xptr_t dev_xp = chdev_dir.txt_tx[0];
    190 
    191     assert( (dev_xp != XPTR_NULL) , __FUNCTION__ ,
    192     "undefined TXT0 chdev descriptor" );
    193 
    194     // get TXTO chdev cluster and local pointer
    195     cxy_t    dev_cxy  = GET_CXY( dev_xp );
    196     chdev_t * dev_ptr = (chdev_t *)GET_PTR( dev_xp );
    197 
    198     // If we use MTTYs (vci_multi_tty), we perform only sync writes
    199     // Otherwise, we use vci_tty_tsar so we can use async writes
     196    cxy_t     dev_cxy = GET_CXY( dev_xp );
     197    chdev_t * dev_ptr = GET_PTR( dev_xp );
    200198
    201199    if( dev_ptr->impl == IMPL_TXT_MTY )
     
    211209        args.channel = channel;
    212210
    213         // call driver function
     211        // call directly the driver function
    214212        aux( &args );
    215213
    216         return 0;
    217     }
    218 
     214        error = 0;
     215    }
    219216    else
    220217    {
    221         return dev_txt_access( TXT_WRITE , channel , buffer , count );
     218        // register command in chdev queue for an asynchronous access
     219        error = dev_txt_access( TXT_WRITE , channel , buffer , count );
     220
     221        if( error )
     222        {
     223            printk("\n[ERROR] in %s : cannot write string %s / cycle %d\n",
     224            __FUNCTION__, buffer, (uint32_t)hal_get_cycles() );
     225        }
    222226    }
    223227
     
    225229cycle = (uint32_t)hal_get_cycles();
    226230if( DEBUG_DEV_TXT_TX < cycle )
    227 printk("\n[%s] thread[%x,%x] exit / cycle %d\n",
     231printk("\n[%s] thread[%x,%x] exit /  cycle %d\n",
    228232__FUNCTION__, this->process->pid, this->trdid, cycle );
    229233#endif
     
    233237#endif
    234238
    235 }
     239    return error;
     240
     241}  // end dev_txt_write()
    236242
    237243/////////////////////////////////////////
     
    239245                      char     * buffer )
    240246{
     247    error_t error;
    241248
    242249#if (DEBUG_SYS_READ & 1)
     
    252259#endif
    253260
    254     return dev_txt_access( TXT_READ , channel , buffer , 1 );
     261    // register command in chdev queue for an asynchronous access
     262    error = dev_txt_access( TXT_READ , channel , buffer , 1 );
     263
     264    if( error )
     265    {
     266        printk("\n[ERROR] in %s : cannot get character / cycle %d\n",
     267        __FUNCTION__, (uint32_t)hal_get_cycles() );
     268    }
    255269
    256270#if DEBUG_DEV_TXT_RX
    257271cycle = (uint32_t)hal_get_cycles();
    258272if( DEBUG_DEV_TXT_RX < cycle )
    259 printk("\n[%s] thread[%x,%x] exit / cycle %d\n",
    260 __FUNCTION__, this->process->pid, this->trdid, cycle );
     273printk("\n[%s] thread[%x,%x] get character <%c> / cycle %d\n",
     274__FUNCTION__, this->process->pid, this->trdid, *buffer, cycle );
    261275#endif
    262276
     
    265279#endif
    266280
    267 }
     281    return error;
     282
     283}  // end dev_txt_read()
    268284
    269285////////////////////////////////////////////////
  • trunk/kernel/devices/dev_txt.h

    r626 r637  
    124124 * device and the driver specific data structures when required.
    125125 * It creates the associated server thread and allocates a WTI from local ICU.
    126  * It must de executed by a local thread.
     126 * It must be executed by a thread running in cluster containing the chdev descriptor.
    127127 ******************************************************************************************
    128128 * @ chdev     : local pointer on TXT device descriptor.
     
    134134 * by the "channel" argument. The corresponding request is actually registered in the
    135135 * chdev requests queue, and the calling thread is descheduled, blocked until
    136  * transfer completion.
    137  * It must be called in the client cluster.
     136 * transfer completion. It can be called by any thread running in any cluster.
    138137 ******************************************************************************************
    139138 * @ channel   : TXT channel index.
     
    148147 * by the "channel" argument. The corresponding request is actually registered in the
    149148 * chdev requests queue, and the calling thread is descheduled, blocked until
    150  * transfer completion.
    151  * It must be called in the client cluster.
     149 * transfer completion. It can be called by any thread running in any cluster.
    152150 ******************************************************************************************
    153151 * @ channel   : TXT channel index.
     
    166164 * interfering with another possible TXT access to another terminal.
    167165 * As it is used for debug, the command arguments <buffer> and <count> are registerd
    168  * in a specific "txt_syc_args_t" structure passed to the driver "aux" function.
     166 * in a specific "txt_sync_args_t" structure passed to the driver "aux" function.
    169167 ****************************************************************************************
    170168 * @ buffer    : local pointer on source buffer containing the string.
  • trunk/kernel/fs/devfs.c

    r635 r637  
    675675
    676676            // move burst bytes from k_buf to u_buf                   
    677             hal_strcpy_to_uspace( u_buf , k_buf , burst );
     677            hal_strcpy_to_uspace( u_buf,
     678                                  XPTR( local_cxy , k_buf ),
     679                                  burst );
    678680
    679681            // update loop variables
     
    704706
    705707            // move burst bytes from u_buf to k_buf
    706             hal_strcpy_from_uspace( k_buf , u_buf , burst );
     708            hal_strcpy_from_uspace( XPTR( local_cxy , k_buf ) , u_buf , burst );
    707709
    708710            // write burst bytes from kernel buffer to TXT device
  • trunk/kernel/kern/cluster.c

    r635 r637  
    7676
    7777    // initialize the cluster_info[][] array
    78     for (x = 0; x < CONFIG_MAX_CLUSTERS_X; x++)
    79     {
    80         for (y = 0; y < CONFIG_MAX_CLUSTERS_Y;y++)
     78    for( x = 0 ; x < CONFIG_MAX_CLUSTERS_X ; x++ )
     79    {
     80        for( y = 0; y < CONFIG_MAX_CLUSTERS_Y ; y++ )
    8181        {
    8282            cluster->cluster_info[x][y] = info->cluster_info[x][y];
     
    9595    }
    9696
    97     // initialize number of cores
     97    // initialize number of local cores
    9898        cluster->cores_nr  = info->cores_nr;
    9999
    100100}  // end cluster_info_init()
     101
     102//////////////////////////////////////
     103void cluster_info_display( cxy_t cxy )
     104{
     105    uint32_t  x;
     106    uint32_t  y;
     107    uint32_t  ncores;
     108
     109    cluster_t * cluster = LOCAL_CLUSTER;
     110
     111    // get x_size & y_size from target cluster
     112    uint32_t  x_size = hal_remote_l32( XPTR( cxy , &cluster->x_size ) );
     113    uint32_t  y_size = hal_remote_l32( XPTR( cxy , &cluster->y_size ) );
     114
     115    // get pointers on TXT0 chdev
     116    xptr_t    txt0_xp  = chdev_dir.txt_tx[0];
     117    cxy_t     txt0_cxy = GET_CXY( txt0_xp );
     118    chdev_t * txt0_ptr = GET_PTR( txt0_xp );
     119
     120    // get extended pointer on remote TXT0 lock
     121    xptr_t  lock_xp = XPTR( txt0_cxy , &txt0_ptr->wait_lock );
     122
     123    // get TXT0 lock
     124    remote_busylock_acquire( lock_xp );
     125
     126    nolock_printk("\n***** cluster_info in cluster %x / x_size %d / y_size %d\n",
     127    cxy, x_size, y_size );
     128 
     129    for( x = 0 ; x < x_size ; x++ )
     130    {
     131        for( y = 0 ; y < y_size ; y++ )
     132        {
     133            ncores = (uint32_t)hal_remote_lb( XPTR( cxy , &cluster->cluster_info[x][y] ) );
     134            nolock_printk(" - ncores[%d][%d] = %d\n", x, y, ncores );
     135        }
     136    }
     137
     138    // release TXT0 lock
     139    remote_busylock_release( lock_xp );
     140
     141}  // end cluster_info_display()
    101142
    102143/////////////////////////////////////////////////////////
     
    115156printk("\n[%s] thread[%x,%x] enters for cluster %x / cycle %d\n",
    116157__FUNCTION__, this->process->pid, this->trdid, local_cxy , cycle );
     158#endif
     159
     160#if (DEBUG_CLUSTER_INIT & 1)
     161cluster_info_display( local_cxy );
    117162#endif
    118163
     
    243288}
    244289
    245 ////////////////////////////////////////
    246 bool_t cluster_is_undefined( cxy_t cxy )
    247 {
    248     uint32_t  x_size = LOCAL_CLUSTER->x_size;
    249     uint32_t  y_size = LOCAL_CLUSTER->y_size;
    250 
    251     uint32_t  x      = HAL_X_FROM_CXY( cxy );
    252     uint32_t  y      = HAL_Y_FROM_CXY( cxy );
    253 
    254     if( x >= x_size ) return true;
    255     if( y >= y_size ) return true;
    256 
    257     return false;
    258 }
    259 
    260 //////////////////////////////////////
    261 bool_t cluster_is_active ( cxy_t cxy )
     290/////////////////////////////////////////////
     291inline bool_t cluster_is_active ( cxy_t cxy )
    262292{
    263293    uint32_t x = HAL_X_FROM_CXY( cxy );
     
    271301////////////////////////////////////////////////////////////////////////////////////
    272302
    273 ///////////////////////////////////////
    274 lid_t cluster_select_local_core( void )
    275 {
    276     uint32_t      min = 1000;
     303/////////////////////////////////////////////
     304lid_t cluster_select_local_core( cxy_t  cxy )
     305{
     306    uint32_t      min = 1000000;
    277307    lid_t         sel = 0;
    278308    uint32_t      nthreads;
    279309    lid_t         lid;
    280310    scheduler_t * sched;
    281 
    282     cluster_t * cluster = LOCAL_CLUSTER;
    283 
    284     for( lid = 0 ; lid < cluster->cores_nr ; lid++ )
    285     {
    286         sched    = &cluster->core_tbl[lid].scheduler;
    287         nthreads = sched->u_threads_nr + sched->k_threads_nr;
     311    cluster_t   * cluster = LOCAL_CLUSTER;
     312    uint32_t      ncores = hal_remote_l32( XPTR( cxy , &cluster->cores_nr ) );
     313
     314    for( lid = 0 ; lid < ncores ; lid++ )
     315    {
     316        sched  = &cluster->core_tbl[lid].scheduler;
     317
     318        nthreads = hal_remote_l32( XPTR( cxy , &sched->u_threads_nr ) ) +
     319                   hal_remote_l32( XPTR( cxy , &sched->k_threads_nr ) );
    288320
    289321        if( nthreads < min )
     
    700732    uint32_t      pref_nr;       // number of owned processes in cluster cxy
    701733
    702 assert( (cluster_is_undefined( cxy ) == false), "illegal cluster index" );
     734assert( (cluster_is_active( cxy ) ), "illegal cluster index" );
    703735
    704736    // get extended pointer on root and lock for local process list in cluster
  • trunk/kernel/kern/cluster.h

    r635 r637  
    44 * authors  Ghassan Almaless (2008,2009,2010,2011,2012)
    55 *          Mohamed Lamine Karaoui (2015)
    6  *          Alain Greiner (2016,2017,2018)
     6 *          Alain Greiner (2016,2017,2018,2019)
    77 *
    88 * Copyright (c) UPMC Sorbonne Universites
     
    112112    uint32_t        nb_fbf_channels;   /*! number of FBF channels                         */
    113113
    114     char            cluster_info[CONFIG_MAX_CLUSTERS_X][CONFIG_MAX_CLUSTERS_Y];
     114    // number of cores for each cluster in the mesh
     115    uint8_t         cluster_info[CONFIG_MAX_CLUSTERS_X][CONFIG_MAX_CLUSTERS_Y];
    115116
    116117    // local parameters
     
    162163 * in the local boot-info structure <info> build by the boot-loader.
    163164 * 1) the cluster_info_init() function is called first, to initialize the structural
    164  *    constants, and cannot use the TXT0 kernel terminal.
    165  * 2) the cluster_manager_init() function initialize various complex structures:
     165 *    constants, including the cluster_info[x][y] array.
     166 *    It cannot use the TXT0 kernel terminal.
     167 * 2) the cluster_manager_init() function initializes various complex structures:
    166168 *    - the local DQDT nodes,
    167169 *    - the PPM, KHM, and KCM allocators,
     
    169171 *    - the local RPC FIFO,
    170172 *    - the process manager.
    171  *    It does NOT initialise the local device descriptors.
    172173 *    It can use the TXT0 kernel terminal.
    173174 ******************************************************************************************
     
    178179
    179180/******************************************************************************************
    180  * This function checks the validity of a cluster identifier.
    181  ******************************************************************************************
    182  * @ cxy    : cluster identifier to be checked.
    183  * @ returns true if the identified cluster does not exist.
    184  *****************************************************************************************/
    185 bool_t cluster_is_undefined( cxy_t cxy );
    186 
    187 /******************************************************************************************
    188  * This function uses the local cluster_info[][] array in cluster descriptor,
    189  * and returns true when the cluster identified by the <cxy> argument is active.
    190  ******************************************************************************************
    191  * @ cxy   : cluster identifier.
     181 * This debug function displays the current values stored in the cluster_info[][] array
     182 * of a remote cluster identified by the <cxy> argument.
     183 * It can be called by a thread running in any cluster.
     184 ******************************************************************************************
     185 * @ cxy   : remote cluster identifier.
     186 *****************************************************************************************/
     187void cluster_info_display( cxy_t  cxy );
     188
     189/******************************************************************************************
     190 * This function accesses the local cluster_info[][] array and returns true when the
     191 * cluster identified by the <cxy> argument is active (contains a kernel instance).
     192 ******************************************************************************************
     193 * @ cxy   : checked cluster identifier.
    192194 * @ return true if cluster contains a kernel instance.
    193195 *****************************************************************************************/
     
    300302 * This function displays on the kernel terminal TXT0 all user processes registered
    301303 * in the cluster defined by the <cxy> argument.
    302  * It can be called by a thread running in any cluster, because is use remote accesses
    303  * to scan the xlist of registered processes.
     304 * It can be called by a thread running in any cluster.
    304305 ******************************************************************************************
    305306 * @ cxy   : cluster identifier.
     
    310311
    311312/******************************************************************************************
    312  * This function uses the local boot_inforeturns the core local index that has the lowest usage in local cluster.
    313  *****************************************************************************************/
    314 lid_t cluster_select_local_core( void );
     313 * This function selects the core that has the lowest usage in a - possibly remote -
     314 * cluster identified by the <cxy> argument.
     315 * It can be called by a thread running in any cluster.
     316 ******************************************************************************************
     317 * @ cxy    : target cluster identifier.
     318 * @ return the selected core local index.
     319 *****************************************************************************************/
     320lid_t cluster_select_local_core( cxy_t  cxy );
    315321
    316322             
  • trunk/kernel/kern/do_syscall.c

    r626 r637  
    9595
    9696    sys_get_config,         // 40
    97     sys_get_core,           // 41
     97    sys_get_core_id,        // 41
    9898    sys_get_cycle,          // 42
    9999    sys_display,            // 43
     
    108108    sys_sync,               // 51
    109109    sys_fsync,              // 52
     110    sys_get_best_core,      // 53
     111    sys_get_nb_cores,       // 54
    110112};
    111113
     
    160162
    161163    case SYS_GET_CONFIG:                   return "GET_CONFIG";       // 40
    162     case SYS_GET_CORE:                     return "GET_CORE";         // 41
     164    case SYS_GET_CORE_ID:                  return "GET_CORE_ID";      // 41
    163165    case SYS_GET_CYCLE:                    return "GET_CYCLE";        // 42
    164166    case SYS_DISPLAY:                      return "DISPLAY";          // 43
     
    172174    case SYS_EXIT:                         return "EXIT";             // 50
    173175    case SYS_SYNC:                         return "SYNC";             // 51
    174     case SYS_FSYNC:                        return "FSYNc";            // 52
     176    case SYS_FSYNC:                        return "FSYNC";            // 52
     177    case SYS_GET_BEST_CORE:                return "GET_BEST_CORE";    // 53
     178    case SYS_GET_NB_CORES:                 return "GET_NB_CORES";     // 54
    175179
    176180    default:                               return "undefined";
  • trunk/kernel/kern/dqdt.c

    r632 r637  
    22 * dqdt.c - Distributed Quaternary Decision Tree implementation.
    33 *
    4  * Author : Alain Greiner (2016,2017,2018)
     4 * Author : Alain Greiner (2016,2017,2018,2019)
    55 *
    66 * Copyright (c)  UPMC Sorbonne Universites
     
    5555
    5656    // display node content
    57         nolock_printk("- level %d / cluster %x : threads = %x / pages = %x / clusters %d / cores %d\n",
    58     node.level, GET_CXY( node_xp ), node.threads, node.pages, node.clusters, node.cores );
     57        nolock_printk("- [%d,%x] : threads %x / pages %x / clusters %d / cores %d / parent_cxy %x\n",
     58                  node.level, GET_CXY( node_xp ),
     59                  node.threads, node.pages,
     60                  node.clusters, node.cores,
     61                  GET_CXY( node.parent ) );
    5962
    6063    // recursive call on children if node is not terminal
     
    116119                                  xptr_t   parent_xp )
    117120{
    118     assert( (level < 5) , __FUNCTION__, "illegal DQDT level %d\n", level );
     121    assert( (level <= 5) , __FUNCTION__, "illegal DQDT level %d\n", level );
    119122 
    120123    uint32_t node_x;         // node X coordinate
     
    147150
    148151#if DEBUG_DQDT_INIT
    149 printk("\n[DBG] %s : cxy(%d,%d) / level %d / mask %x / half %d / ptr %x\n",
     152printk("\n[%s] thread[%x,%x] : cxy(%d,%d) / level %d / mask %x / half %d / ptr %x\n",
    150153__FUNCTION__, node_x, node_y, level, mask, half, node_ptr );
    151154#endif
     
    336339void dqdt_init( void )
    337340{
    338     // get x_size & y_size from cluster manager
    339     cluster_t * cluster = &cluster_manager;
     341    // get x_size & y_size
     342    cluster_t * cluster = LOCAL_CLUSTER;
    340343    uint32_t    x_size  = cluster->x_size;
    341344    uint32_t    y_size  = cluster->y_size;
     
    349352    uint32_t  level_max  = bits_log2( size_ext );
    350353
    351     // each CP0 register the DQDT root in local cluster manager
     354    // all CP0s register the DQDT root in local cluster manager
    352355    cluster->dqdt_root_xp = XPTR( 0 , &cluster->dqdt_tbl[level_max] );
    353356
     357    // only CP0 in cluster 0 build the DQDT
     358    if( local_cxy == 0 )
     359    {
     360
    354361#if DEBUG_DQDT_INIT
    355 if( local_cxy == 0 )
    356 printk("\n[DBG] %s : x_size = %d / y_size = %d / level_max = %d\n",
    357 __FUNCTION__, x_size, y_size, level_max );
     362thread_t * this = CURRENT_THREAD;
     363printk("\n[%s] thread[%x,%x] enters : x_size = %d / y_size = %d / level_max = %d\n",
     364__FUNCTION__, this->process->pid, this->trdid, x_size, y_size, level_max );
    358365#endif
    359366   
     
    362369
    363370#if DEBUG_DQDT_INIT
    364 if( local_cxy == 0 ) dqdt_display();
    365 #endif
    366 
     371dqdt_display();
     372#endif
     373
     374    }
    367375}  // end dqdt_init()
    368376
     
    516524}
    517525
     526///////////////////////////////////
     527xptr_t dqdt_get_root( cxy_t    cxy,
     528                      uint32_t level )
     529{
     530    xptr_t        node_xp;
     531    cxy_t         node_cxy;
     532    dqdt_node_t * node_ptr;
     533    uint32_t      current_level;
     534
     535assert( (level <= 5) , __FUNCTION__, "illegal DQDT level %d\n", level );
     536
     537#if DEBUG_DQDT_GET_ROOT
     538thread_t * this = CURRENT_THREAD;
     539printk("\n[%s] thread[%x,%x] enters / cxy %x / level %d\n",
     540__FUNCTION__, this->process->pid, this->trdid, cxy, level );
     541#endif
     542
     543    // check macro-cluster
     544    if( cluster_is_active( cxy ) )
     545    {   
     546        // initialise node_xp and current_level
     547        node_xp       = XPTR( cxy , &LOCAL_CLUSTER->dqdt_tbl[0] );
     548        current_level = 0;
     549
     550        // traverse the quad-tree from bottom to root
     551        while( current_level < level )
     552        {
     553            node_cxy = GET_CXY( node_xp );
     554            node_ptr = GET_PTR( node_xp );
     555
     556            node_xp = hal_remote_l64( XPTR( node_cxy , &node_ptr->parent ) );
     557            current_level++;
     558        }
     559    }
     560    else
     561    {
     562        node_xp =  XPTR_NULL;
     563    }
     564
     565#if DEBUG_DQDT_GET_ROOT
     566printk("\n[%s] thread[%x,%x] exit / root_xp[%x,%x]\n",
     567__FUNCTION__, this->process->pid, this->trdid, GET_CXY( node_xp ), GET_PTR( node_xp ) );
     568#endif
     569
     570    return node_xp;
     571   
     572}
    518573
    519574/////////////////////////////////////////////////////////////////////////////////////
     
    584639
    585640
    586 //////////////////////////////////////////
    587 cxy_t dqdt_get_cluster_for_process( void )
     641///////////////////////////////////////////////////
     642cxy_t dqdt_get_cluster_for_thread( xptr_t root_xp )
    588643{
    589644    // call recursive function
    590     cxy_t cxy = dqdt_select_cluster( LOCAL_CLUSTER->dqdt_root_xp , false );
    591 
    592 #if DEBUG_DQDT_SELECT_FOR_PROCESS
     645    cxy_t cxy = dqdt_select_cluster( root_xp , false );
     646
     647#if DEBUG_DQDT_SELECT_FOR_THREAD
    593648uint32_t cycle = hal_get_cycles();
    594649if( cycle > DEBUG_DQDT_SELECT_FOR_PROCESS )
     
    600655}
    601656
    602 /////////////////////////////////////////
    603 cxy_t dqdt_get_cluster_for_memory( void )
     657///////////////////////////////////////////////////
     658cxy_t dqdt_get_cluster_for_memory( xptr_t root_xp )
    604659{
    605660    // call recursive function
    606     cxy_t cxy = dqdt_select_cluster( LOCAL_CLUSTER->dqdt_root_xp , true );
     661    cxy_t cxy = dqdt_select_cluster( root_xp , true );
    607662
    608663#if DEBUG_DQDT_SELECT_FOR_MEMORY
  • trunk/kernel/kern/dqdt.h

    r632 r637  
    22 * kern/dqdt.h - Distributed Quad Decision Tree
    33 *
    4  * Author : Alain Greiner (2016,2017,2018)
     4 * Author : Alain Greiner (2016,2017,2018,2019)
    55 *
    66 * Copyright (c)  UPMC Sorbonne Universites
     
    3131/****************************************************************************************
    3232 * This DQDT infrastructure maintains a topological description of ressources usage
    33  * in each cluster: number of threads, and number of physical pages allocated.
     33 * in each cluster: number of threads per core, and number of physical pages allocated.
    3434 *
    35  * - If X_SIZE or Y_SIZE are equal to 1, it makes the assumption that the cluster
    36  *   topology is a one dimensionnal vector, an build the smallest one-dimensionnal
    37  *   quad-tree covering this one-dimensionnal vector. If the number of clusters
    38  *   is not a power of 4, the tree is truncated as required.
    39  *
    40  *   TODO : the mapping for the one dimensionnal topology is not implemented yet [AG].
    41  *
    42  * - If both Y_SIZE and Y_SIZE are larger than 1, it makes the assumption that
    43  *   the clusters topology is a 2D mesh. The [X,Y] coordinates of a cluster are
    44  *   obtained from the CXY identifier using the Rrelevant macros.
    45  *      X = CXY >> Y_WIDTH   /  Y = CXY & ((1<<Y_WIDTH)-1)
    46  * - If the mesh X_SIZE and Y_SIZE dimensions are not equal, or are not power of 2,
    47  *   or the mesh contains "holes" reported in the cluster_info[x][y] array,
    48  *   we build the smallest two dimensionnal quad-tree covering all clusters,
    49  *   and this tree is truncated as required.
    50  * - The mesh size is supposed to contain at most 32 * 32 clusters.
    51  *   Therefore, it can exist at most 6 DQDT nodes in a given cluster:
    52  *   . Level 0 nodes exist on all clusters and have no children.
    53  *   . Level 1 nodes exist when both X and Y coordinates are multiple of 2
    54  *   . Level 2 nodes exist when both X and Y coordinates are multiple of 4
    55  *   . Level 3 nodes exist when both X and Y coordinates are multiple of 8
    56  *   . Level 4 nodes exist when both X and Y coordinates are multiple of 16
    57  *   . Level 5 nodes exist when both X and Y coordinates are multiple of 32
    58  * - For nodes other than level 0, the placement is defined as follow:
    59  *   . The root node is placed in the cluster containing the core executing
    60  *     the dqdt_init() function.
    61  *   . An intermediate node (representing a given sub-tree) is placed in one
    62  *     cluster covered by the subtree, pseudo-randomly selected.
     35 * It is organized as a quad-tree, where the leaf cells are the clusters, organised
     36 * as a 2D mesh. Each node in the quad-tree (including the root and the leaf cells,
     37 * covers a "macro-cluster", that is a square array of clusters where the number
     38 * in the macro-cluster is a power of 4, and the macro-cluster side is a power of two.
     39 * Each node contains informations on ressources usage (physical memory and cores)
     40 * in the covered macro-cluster.
     41 * This quad-tree can be truncated, if the physical mesh X_SIZE and Y_SIZE dimensions
     42 * are not equal, or are not power of 2, or if the physical mesh contains "holes".
     43 * The mesh size is supposed to contain at most 32*32 clusters in this implementation.
     44 *   . Level 0 nodes exist in all clusters and have no children.
     45 *   . Level 1 nodes can be placed in any cluster of the covered  2*2  macro-cluster.
     46 *   . Level 2 nodes can be placed in any cluster of the covered  4*4  macro-cluster.
     47 *   . Level 3 nodes can be placed in any cluster of the covered  8*8  macro-cluster.
     48 *   . Level 4 nodes can be placed in any cluster of the covered 16*16 macro-cluster.
     49 *   . Level 5 nodes can be placed in any cluster of the covered 32*32 macro-cluster.
     50 * The root node is placed in the cluster containing the core executing the dqdt_init()
     51 * function. Other (non level 0) nodes are placed pseudo-randomly.
    6352 ***************************************************************************************/
    6453
     
    6655 * This structure describes a node of the DQDT.
    6756 * The max number of children is 4, but it can be smaller for some nodes.
    68  * Level 0 nodes are the clusters, and have no children.
    69  * The root node has no parent.
     57 * Level 0 nodes have no children. The root node has no parent.
    7058 ***************************************************************************************/
    7159
     
    7462        uint32_t      level;            /*! node level                                     */
    7563        uint32_t      arity;            /*! actual children number in this node            */
    76     uint32_t      threads;          /*! current number of threads in macro-cluster     */
    77     uint32_t      pages;            /*! current number of pages in macro-cluster       */
     64    uint32_t      threads;          /*! number of threads in macro-cluster             */
     65    uint32_t      pages;            /*! number of allocated pages in macro-cluster     */
    7866    uint32_t      cores;            /*! number of active cores in macro cluster        */
    79     uint32_t      clusters;         /*! number of active cluster in macro cluster      */
     67    uint32_t      clusters;         /*! number of active clusters in macro cluster     */
    8068        xptr_t        parent;           /*! extended pointer on parent node                */
    8169        xptr_t        children[2][2];   /*! extended pointers on children nodes            */
     
    8775 * This function recursively initializes the DQDT structure from information
    8876 * stored in cluster manager (x_size, y_size and cluster_info[x][y]).
    89  * It is executed in all clusters by the local CP0, to compute level_max and register
     77 * It is called in all clusters by the local CP0, to compute level_max and register
    9078 * the DQDT root node in each cluster manager, but only CP0 in cluster 0 actually builds
    9179 * the quad-tree covering all active clusters.
     
    10290 ***************************************************************************************/
    10391void dqdt_increment_threads( void );
     92
    10493void dqdt_decrement_threads( void );
    10594
     
    121110
    122111/****************************************************************************************
    123  * This function can be called in any cluster. It traverses the DQDT tree
    124  * from the root to the bottom, to analyse the computing load and select the cluster
    125  * with the lowest number ot threads to place a new process.
     112 * This function returns an extended pointer on the dqdt node that is the root of
     113 * the sub-tree covering the macro-cluster defined by the <level> argument and
     114 * containing the cluster defined by the <cxy> argument. It returns XPTR_NULL if
     115 * this macro-cluster is undefined (when the cxy cluster contains no core).
    126116 ****************************************************************************************
     117 * @ cxy   : cluster identifier.
     118 * @ level   : level of the sub-tree.
     119 * @ returns  root_xp if success / return XPTR_NULL if no active core in macro_cluster.
     120 ***************************************************************************************/
     121xptr_t dqdt_get_root( cxy_t    cxy,
     122                      uint32_t level );
     123
     124/****************************************************************************************
     125 * This function can be called in any cluster. It traverses the DQDT tree from the
     126 * local root of a macro-cluster, defined by the <root_xp> argument, to the bottom.
     127 * It analyses the computing load & selects the cluster containing the lowest number
     128 * of threads.
     129 ****************************************************************************************
     130 * @ root_xp  : extended pointer on DQDT node root.
    127131 * @ returns the cluster identifier with the lowest computing load.
    128132 ***************************************************************************************/
    129 cxy_t dqdt_get_cluster_for_process( void );
     133cxy_t dqdt_get_cluster_for_thread( xptr_t root_xp );
    130134
    131135/****************************************************************************************
    132  * This function can be called in any cluster. It traverses the DQDT tree
    133  * from the root to the bottom, to analyse the memory load and select the cluster
    134  * with the lowest memory load for dynamic memory allocation with no locality constraint.
     136 * This function can be called in any cluster. It traverses the DQDT tree from the
     137 * local root of a macro-cluster, defined by the <root_xp> argument, to the bottom.
     138 * It analyses the memory load & selects the cluster with the lowest number of allocated
     139 * physical pages.
    135140 ****************************************************************************************
     141 * @ root_xp  : extended pointer on DQDT node root.
    136142 * @ returns the cluster identifier with the lowest memory load.
    137143 ***************************************************************************************/
    138 cxy_t dqdt_get_cluster_for_memory( void );
     144cxy_t dqdt_get_cluster_for_memory( xptr_t root_xp );
    139145
    140146/****************************************************************************************
    141147 * This function displays on kernel TXT0 the DQDT state for all nodes in the quad-tree.
    142  * It traverses the quadtree from root to bottom, and can be called by a thread
    143  * running in any cluster
     148 * It traverses the quadtree from the global root to bottom.
     149 * It can be called by a thread running in any cluster
    144150 ***************************************************************************************/
    145151void dqdt_display( void );
  • trunk/kernel/kern/kernel_init.c

    r635 r637  
    10081008
    10091009    /////////////////////////////////////////////////////////////////////////////////
    1010     // STEP 2 : core[0] initializes the cluter manager,
    1011     //          including the physical memory allocator.
     1010    // STEP 2 : core[0] initializes the cluster manager,
     1011    //          including the physical memory allocators.
    10121012    /////////////////////////////////////////////////////////////////////////////////
    10131013
     
    11021102
    11031103    ////////////////////////////////////////////////////////////////////////////////
    1104     // STEP 5 : core[0] initializes the distibuted LAPIC descriptor.
    1105     //          core[0] initializes the internal chdev descriptors
     1104    // STEP 5 : core[0] initialize the distributed LAPIC descriptor.
     1105    //          core[0] initialize the internal chdev descriptors
    11061106    //          core[0] initialize the local external chdev descriptors
    11071107    ////////////////////////////////////////////////////////////////////////////////
  • trunk/kernel/kern/process.c

    r635 r637  
    19091909
    19101910    // select a core in local cluster to execute the main thread
    1911     lid  = cluster_select_local_core();
     1911    lid  = cluster_select_local_core( local_cxy );
    19121912
    19131913    // initialize pthread attributes for main thread
  • trunk/kernel/kern/rpc.c

    r635 r637  
    10531053
    10541054    // select one core
    1055     core_lid = cluster_select_local_core();
     1055    core_lid = cluster_select_local_core( local_cxy );
    10561056
    10571057    // call local kernel function
  • trunk/kernel/kern/scheduler.h

    r564 r637  
    4141{
    4242    busylock_t        lock;            /*! lock protecting scheduler state                  */
    43     uint16_t          u_threads_nr;    /*! total number of attached user threads            */
    44     uint16_t          k_threads_nr;    /*! total number of attached kernel threads          */
     43    uint32_t          u_threads_nr;    /*! total number of attached user threads            */
     44    uint32_t          k_threads_nr;    /*! total number of attached kernel threads          */
    4545    list_entry_t      u_root;          /*! root of list of user threads                     */
    4646    list_entry_t      k_root;          /*! root of list of kernel threads                   */
  • trunk/kernel/kern/thread.c

    r635 r637  
    247247    else
    248248    {
    249         core_lid = cluster_select_local_core();
     249        core_lid = cluster_select_local_core( local_cxy );
    250250    }
    251251
     
    375375printk("\n[%s] CPU & FPU contexts created\n",
    376376__FUNCTION__, thread->trdid );
    377 hal_vmm_display( process , true );
     377hal_vmm_display( XPTR( local_cxy , process ) , true );
    378378#endif
    379379
     
    418418
    419419    // select a target core in local cluster
    420     core_lid = cluster_select_local_core();
     420    core_lid = cluster_select_local_core( local_cxy );
    421421
    422422#if (DEBUG_THREAD_USER_FORK & 1)
     
    724724printk("\n[%s] thread[%x,%x] set CPU context & jump to user code / cycle %d\n",
    725725__FUNCTION__, process->pid, thread->trdid, cycle );
    726 hal_vmm_display( process , true );
     726hal_vmm_display( XPTR( local_cxy , process ) , true );
    727727#endif
    728728
     
    13321332    // check trdid argument
    13331333        if( (target_thread_ltid >= CONFIG_THREADS_MAX_PER_CLUSTER) ||
    1334         cluster_is_undefined( target_cxy ) )         return XPTR_NULL;
     1334        cluster_is_active( target_cxy ) == false )                return XPTR_NULL;
    13351335
    13361336    // get root of list of process descriptors in target cluster
  • trunk/kernel/kernel_config.h

    r635 r637  
    6868#define DEBUG_ELF_LOAD                    0
    6969
     70#define DEBUG_DQDT_GET_ROOT               0
    7071#define DEBUG_DQDT_INIT                   0
     72#define DEBUG_DQDT_SELECT_FOR_THREAD      0
     73#define DEBUG_DQDT_SELECT_FOR_MEMORY      0
     74#define DEBUG_DQDT_UPDATE_PAGES           0
    7175#define DEBUG_DQDT_UPDATE_THREADS         0
    72 #define DEBUG_DQDT_SELECT_FOR_PROCESS     0
    73 #define DEBUG_DQDT_UPDATE_PAGES           0
    74 #define DEBUG_DQDT_SELECT_FOR_MEMORY      0
    7576
    7677#define DEBUG_FATFS_ADD_DENTRY            0
     
    170171#define DEBUG_RWLOCK_CXY                  0
    171172
    172 #define DEBUG_SCHED_HANDLE_SIGNALS        2
     173#define DEBUG_SCHED_HANDLE_SIGNALS        0
    173174#define DEBUG_SCHED_YIELD                 0
    174175#define DEBUG_SCHED_RPC_ACTIVATE          0
     
    176177#define DEBUG_SEM                         0
    177178
    178 #define DEBUG_SYSCALLS_ERROR              2
     179#define DEBUG_SYSCALLS_ERROR                  2
    179180
    180181#define DEBUG_SYS_BARRIER                 0
     
    190191#define DEBUG_SYS_GETCWD                  0
    191192#define DEBUG_SYS_GETPID                  0
     193#define DEBUG_SYS_GET_BEST_CORE           0
     194#define DEBUG_SYS_GET_CORE_ID             0
     195#define DEBUG_SYS_GET_NB_CORES            0
    192196#define DEBUG_SYS_ISATTY                  0
    193197#define DEBUG_SYS_IS_FG                   0
     
    456460
    457461#define CONFIG_INSTRUMENTATION_SYSCALLS    0
    458 #define CONFIG_INSTRUMENTATION_PGFAULTS    1
    459 #define CONFIG_INSTRUMENTATION_FOOTPRINT   1
     462#define CONFIG_INSTRUMENTATION_PGFAULTS    0
     463#define CONFIG_INSTRUMENTATION_FOOTPRINT   0
    460464
    461465
  • trunk/kernel/mm/mapper.c

    r635 r637  
    442442        if ( page_xp == XPTR_NULL ) return -1;
    443443
    444         // compute cluster and pointers on page in mapper
    445         xptr_t     map_xp  = ppm_page2base( page_xp );
    446         uint8_t  * map_ptr = GET_PTR( map_xp );
    447         cxy_t      map_cxy = GET_CXY( map_xp );
     444        // compute extended pointer in kernel mapper
     445        xptr_t     map_xp  = ppm_page2base( page_xp ) + page_offset;
    448446
    449447#if (DEBUG_MAPPER_MOVE_USER & 1)
     
    458456        if( to_buffer )
    459457        {
    460             hal_copy_to_uspace( map_cxy , map_ptr + page_offset , buf_ptr , page_bytes );
     458            hal_copy_to_uspace( buf_ptr , map_xp , page_bytes );
    461459
    462460#if DEBUG_MAPPER_MOVE_USER & 1
     
    464462printk("\n[%s] thread[%x,%x] moved %d bytes / mapper %s (%x,%x) -> user buffer(%x,%x)\n",
    465463__FUNCTION__, this->process->pid, this->trdid, page_bytes,
    466 name, map_cxy, map_ptr + page_offset, local_cxy, buf_ptr );
     464name, GET_CXY(map_xp), GET_PTR(map_xp), local_cxy, buf_ptr );
    467465#endif
    468466
     
    471469        {
    472470            ppm_page_do_dirty( page_xp );
    473             hal_copy_from_uspace( map_cxy , map_ptr + page_offset , buf_ptr , page_bytes );
     471            hal_copy_from_uspace( map_xp , buf_ptr , page_bytes );
    474472
    475473#if DEBUG_MAPPER_MOVE_USER & 1
     
    477475printk("\n[%s] thread[%x,%x] moved %d bytes / user buffer(%x,%x) -> mapper %s (%x,%x)\n",
    478476__FUNCTION__, this->process->pid, this->trdid, page_bytes,
    479 local_cxy, buf_ptr, name, map_cxy, map_ptr + page_offset );
     477local_cxy, buf_ptr, name, GET_CXY(map_xp), GET_PTR(map_xp) );
    480478mapper_display_page(  mapper_xp , page_id, 128 );
    481479#endif
  • trunk/kernel/mm/ppm.c

    r636 r637  
    533533    page_xp = XPTR( page_cxy , page_ptr );
    534534   
    535 
    536535    // get local pointer on PPM (same in all clusters)
    537536        ppm_t * ppm = &LOCAL_CLUSTER->ppm;
     
    568567                buddy_index = current_index ^ (1 << current_order);
    569568                buddy_ptr   = pages_tbl + buddy_index;
     569
     570        // get buddy order
     571        buddy_order = hal_remote_l32( XPTR( page_cxy , &buddy_ptr->order ) );
    570572       
    571573        // exit loop if buddy not found
  • trunk/kernel/syscalls/shared_include/shared_mman.h

    r623 r637  
    5151typedef struct mmap_attr_s
    5252{
    53         void         * addr;       /*! requested virtual address (unused : should be NULL)    */
     53        void         * addr;       /*! buffer for allocated vseg base address (return value)  */
    5454        unsigned int   length;     /*! requested vseg size (bytes)                            */
    5555        unsigned int   prot;       /*! access modes                                           */
  • trunk/kernel/syscalls/shared_include/syscalls_numbers.h

    r626 r637  
    2929 * It must be kept consistent with the array defined in do_syscalls.c
    3030 *****************************************************************************************/
    31 typedef enum {
     31typedef enum
     32{
    3233    SYS_THREAD_EXIT    = 0,
    3334    SYS_THREAD_YIELD   = 1,
     
    7576
    7677    SYS_GET_CONFIG     = 40,
    77     SYS_GET_CORE       = 41,
     78    SYS_GET_CORE_ID    = 41,
    7879    SYS_GET_CYCLE      = 42,
    7980    SYS_DISPLAY        = 43,
     
    8889    SYS_SYNC           = 51,
    8990    SYS_FSYNC          = 52,
     91    SYS_GET_BEST_CORE  = 53,
     92    SYS_GET_NB_CORES   = 54,
    9093
    91     SYSCALLS_NR        = 53,
     94    SYSCALLS_NR        = 55,
    9295
    9396} syscalls_t;
  • trunk/kernel/syscalls/sys_barrier.c

    r635 r637  
    3333#include <remote_barrier.h>
    3434
    35 #if DEBUG_SYS_BARRIER
    3635//////////////////////////////////////////////////////
    3736static char * sys_barrier_op_str( uint32_t operation )
     
    4241        else                                    return "undefined";
    4342}
    44 #endif
    4543
    4644//////////////////////////////////
     
    7472
    7573#if DEBUG_SYSCALLS_ERROR
    76 printk("\n[ERROR] in %s : unmapped barrier %x / thread %x / process %x\n",
    77 __FUNCTION__ , vaddr , this->trdid , process->pid );
     74printk("\n[ERROR] in %s for %s : unmapped barrier %x / thread[%x,%x]\n",
     75__FUNCTION__, sys_barrier_op_str(operation), vaddr, process->pid, this->trdid );
    7876#endif
    7977        this->errno = error;
     
    9492
    9593#if DEBUG_SYSCALLS_ERROR
    96 printk("\n[ERROR] in %s : unmapped barrier attributes %x / thread %x / process %x\n",
    97 __FUNCTION__ , attr , this->trdid , process->pid );
     94printk("\n[ERROR] in %s for INIT : unmapped barrier attributes %x / thread[%x,%x]\n",
     95__FUNCTION__ , attr , process->pid , this->trdid );
    9896#endif
    9997                    this->errno = EINVAL;
     
    102100 
    103101                // copy barrier attributes into kernel space
    104                 hal_copy_from_uspace( local_cxy,
    105                                       &k_attr,
    106                                       (void*)attr,
     102                hal_copy_from_uspace( XPTR( local_cxy , &k_attr ),
     103                                      (void *)attr,
    107104                                      sizeof(pthread_barrierattr_t) );
    108105
     
    111108
    112109#if DEBUG_SYSCALLS_ERROR
    113 printk("\n[ERROR] in %s : wrong arguments / count %d / x_size %d / y_size %d / nthreads %x\n",
     110printk("\n[ERROR] in %s for INIT : count (%d) != x_size (%d) * y_size (%d) * nthreads (%x)\n",
    114111__FUNCTION__, count, k_attr.x_size, k_attr.y_size, k_attr.nthreads );
    115112#endif
     
    131128
    132129#if DEBUG_SYSCALLS_ERROR
    133 printk("\n[ERROR] in %s : cannot create barrier %x / thread %x / process %x\n",
    134 __FUNCTION__ , vaddr , this->trdid , process->pid );
     130printk("\n[ERROR] in %s for INIT : cannot create barrier %x / thread[%x,%x]\n",
     131__FUNCTION__ , vaddr , process->pid , this->trdid );
    135132#endif
    136133                this->errno = ENOMEM;
     
    148145
    149146#if DEBUG_SYSCALLS_ERROR
    150 printk("\n[ERROR] in %s : barrier %x not registered / thread %x / process %x\n",
    151 __FUNCTION__ , (intptr_t)vaddr , this->trdid , process->pid );
     147printk("\n[ERROR] in %s for WAIT : barrier %x not registered / thread[%x,%x]\n",
     148__FUNCTION__ , (intptr_t)vaddr , process->pid, this->trdid );
    152149#endif
    153150                this->errno = EINVAL;
     
    169166
    170167#if DEBUG_SYSCALLS_ERROR
    171 printk("\n[ERROR] in %s : barrier %x not registered / thread %x / process %x\n",
    172 __FUNCTION__ , (intptr_t)vaddr , this->trdid , process->pid );
     168printk("\n[ERROR] in %s for DESTROY : barrier %x not registered / thread[%x,%x]\n",
     169__FUNCTION__ , (intptr_t)vaddr , process->pid, this->trdid );
    173170#endif
    174171                this->errno = EINVAL;
  • trunk/kernel/syscalls/sys_chdir.c

    r610 r637  
    22 * sys_chdir.c - kernel function implementing the "chdir" syscall.
    33 *
    4  * Author    Alain Greiner (2016,2017,2018)
     4 * Author    Alain Greiner (2016,2017,2018, 2019)
    55 *
    66 * Copyright (c) UPMC Sorbonne Universites
     
    7575
    7676    // copy pathname in kernel space
    77     hal_strcpy_from_uspace( kbuf , pathname , CONFIG_VFS_MAX_PATH_LENGTH );
     77    hal_strcpy_from_uspace( XPTR( local_cxy , kbuf ),
     78                            pathname,
     79                            CONFIG_VFS_MAX_PATH_LENGTH );
    7880
    7981#if DEBUG_SYS_CHDIR
  • trunk/kernel/syscalls/sys_chmod.c

    r566 r637  
    22 * sys_chmod.c - Change file access rights.
    33 *
    4  * Author    Alain Greiner  (2016,2017)
     4 * Author    Alain Greiner  (2016,2017,2018,2019)
    55 *
    66 * Copyright (c) 2015 UPMC Sorbonne Universites
     
    4747
    4848#if DEBUG_SYSCALLS_ERROR
    49         printk("\n[ERROR] in %s : pathname too long / thread %x in process %x\n",
    50         __FUNCTION__, this->trdid, process->pid );
     49printk("\n[ERROR] in %s : pathname too long / thread %x in process %x\n",
     50__FUNCTION__, this->trdid, process->pid );
    5151#endif
    5252        this->errno = ENFILE;
     
    5555
    5656    // copy pathname in kernel space
    57     hal_strcpy_from_uspace( kbuf , pathname , CONFIG_VFS_MAX_PATH_LENGTH );
     57    hal_strcpy_from_uspace( XPTR( local_cxy , kbuf ),
     58                            pathname,
     59                            CONFIG_VFS_MAX_PATH_LENGTH );
    5860
    5961    printk("\n[ERROR] in %s : not implemented yet\n", __FUNCTION__ );
  • trunk/kernel/syscalls/sys_display.c

    r635 r637  
    122122
    123123            // copy string to kernel space
    124             hal_strcpy_from_uspace( kbuf , string , 512 );
     124            hal_strcpy_from_uspace( XPTR( local_cxy , kbuf ),
     125                                    string,
     126                                    512 );
    125127
    126128            // print message on TXT0 kernel terminal
     
    136138
    137139            // check cxy argument
    138                 if( cluster_is_undefined( cxy ) )
     140                if( cluster_is_active( cxy ) == false )
    139141            {
    140142
     
    172174
    173175            // check cxy argument
    174                 if( cluster_is_undefined( cxy ) )
     176                if( cluster_is_active( cxy ) == false )
    175177            {
    176178
     
    213215
    214216            // check cxy argument
    215                 if( cluster_is_undefined( cxy ) )
     217                if( cluster_is_active( cxy ) == false )
    216218            {
    217219
     
    323325
    324326            // copy pathname in kernel space
    325             hal_strcpy_from_uspace( kbuf , path , CONFIG_VFS_MAX_PATH_LENGTH );
     327            hal_strcpy_from_uspace( XPTR( local_cxy , kbuf ),
     328                                    path,
     329                                    CONFIG_VFS_MAX_PATH_LENGTH );
    326330
    327331            // compute root inode for pathname
     
    447451                uint32_t  cxy = (uint32_t)arg0;
    448452
    449                 if( cluster_is_undefined( cxy ) )
     453                if( cluster_is_active( cxy ) == false )
    450454                {
    451455
  • trunk/kernel/syscalls/sys_exec.c

    r635 r637  
    8989
    9090    // copy the array of pointers to kernel buffer
    91     hal_copy_from_uspace( local_cxy,
    92                           k_pointers,
     91    hal_copy_from_uspace( XPTR( local_cxy , k_pointers ),
    9392                          u_pointers,
    9493                          CONFIG_PPM_PAGE_SIZE );
     
    109108
    110109        // copy the user string to kernel buffer
    111         hal_copy_from_uspace( local_cxy,
    112                               k_buf_ptr,
     110        hal_copy_from_uspace( XPTR( local_cxy , k_buf_ptr ),
    113111                              k_pointers[index],
    114112                              length );
     
    199197
    200198    // copy pathname in exec_info structure (kernel space)
    201     hal_strcpy_from_uspace( exec_info.path , pathname , CONFIG_VFS_MAX_PATH_LENGTH );
     199    hal_strcpy_from_uspace( XPTR( local_cxy , exec_info.path ),
     200                            pathname,
     201                            CONFIG_VFS_MAX_PATH_LENGTH );
    202202
    203203#if DEBUG_SYS_EXEC
  • trunk/kernel/syscalls/sys_fork.c

    r635 r637  
    105105        else                                  // DQDT placement
    106106        {
    107                 child_cxy = dqdt_get_cluster_for_process();
     107                child_cxy = dqdt_get_cluster_for_thread( LOCAL_CLUSTER->dqdt_root_xp );
    108108        }
    109109
  • trunk/kernel/syscalls/sys_get_config.c

    r635 r637  
    108108
    109109    // copy to user space
    110         hal_copy_to_uspace( local_cxy, &k_x_size, x_size, sizeof(uint32_t) );
    111         hal_copy_to_uspace( local_cxy, &k_y_size, y_size, sizeof(uint32_t) );
    112         hal_copy_to_uspace( local_cxy, &k_ncores, ncores, sizeof(uint32_t) );
     110        hal_copy_to_uspace( x_size, XPTR( local_cxy , &k_x_size ), sizeof(uint32_t) );
     111        hal_copy_to_uspace( y_size, XPTR( local_cxy , &k_y_size ), sizeof(uint32_t) );
     112        hal_copy_to_uspace( ncores, XPTR( local_cxy , &k_ncores ), sizeof(uint32_t) );
    113113
    114114    hal_fence();
  • trunk/kernel/syscalls/sys_get_cycle.c

    r635 r637  
    4545    process_t * process = this->process;
    4646
     47#if (DEBUG_SYS_GET_CYCLE || CONFIG_INSTRUMENTATION_SYSCALLS)
     48uint64_t     tm_start = hal_get_cycles();
     49#endif
     50
    4751    // check buffer in user space
    4852    error = vmm_get_vseg( process , (intptr_t)cycle , &vseg );
     
    6367
    6468    // copy to user space
    65         hal_copy_to_uspace( local_cxy, &k_cycle, cycle, sizeof(uint64_t) );
     69        hal_copy_to_uspace( cycle,
     70                        XPTR( local_cxy , &k_cycle ),
     71                        sizeof(uint64_t) );
     72
     73#if (DEBUG_SYS_GET_CYCLE || CONFIG_INSTRUMENTATION_SYSCALLS)
     74uint64_t     tm_end = hal_get_cycles();
     75#endif
     76
     77#if DEBUG_SYS_GET_CYCLE
     78if( DEBUG_SYS_GET_CYCLE < tm_end )
     79printk("\n[%s] thread[%x,%x] exit / cycle %d\n",
     80__FUNCTION__ , process->pid, this->trdid, (uint32_t)tm_end );
     81#endif
     82
     83#if CONFIG_INSTRUMENTATION_SYSCALLS
     84hal_atomic_add( &syscalls_cumul_cost[SYS_GET_CYCLE] , tm_end - tm_start );
     85hal_atomic_add( &syscalls_occurences[SYS_GET_CYCLE] , 1 );
     86#endif
    6687
    6788        return 0;
  • trunk/kernel/syscalls/sys_getcwd.c

    r610 r637  
    22 * sys_getcwd.c - kernel function implementing the "getcwd" syscall.
    33 *
    4  * Author    Alain Greiner (2016,2017,2018)
     4 * Author    Alain Greiner (2016,2017,2018,2019)
    55 *
    66 * Copyright (c)  UPMC Sorbonne Universites
     
    9797
    9898    // copy kernel buffer to user space
    99     hal_strcpy_to_uspace( buffer , first , CONFIG_VFS_MAX_PATH_LENGTH );
     99    hal_strcpy_to_uspace( buffer,
     100                          XPTR( local_cxy , first ),
     101                          CONFIG_VFS_MAX_PATH_LENGTH );
    100102
    101103    hal_fence();
  • trunk/kernel/syscalls/sys_is_fg.c

    r635 r637  
    9090
    9191    // copy to user space
    92     hal_copy_to_uspace( local_cxy, &is_txt_owner, is_fg, sizeof(uint32_t) );
     92    hal_copy_to_uspace( is_fg,
     93                        XPTR( local_cxy , &is_txt_owner ),
     94                        sizeof(uint32_t) );
    9395
    9496    hal_fence();
  • trunk/kernel/syscalls/sys_mkdir.c

    r610 r637  
    6060
    6161    // copy pathname in kernel space
    62     hal_strcpy_from_uspace( kbuf , pathname , CONFIG_VFS_MAX_PATH_LENGTH );
     62    hal_strcpy_from_uspace( XPTR( local_cxy , kbuf ),
     63                            pathname,
     64                            CONFIG_VFS_MAX_PATH_LENGTH );
    6365
    6466#if DEBUG_SYS_MKDIR
  • trunk/kernel/syscalls/sys_mkfifo.c

    r566 r637  
    22 * sys_mkfifo.c - creates a named FIFO file.
    33 *
    4  * Author    Alain Greiner (2016,2017)
     4 * Author    Alain Greiner (2016,2017,2018,2019)
    55 *
    66 * Copyright (c) UPMC Sorbonne Universites
     
    3333                 uint32_t  mode __attribute__((unused)) )
    3434{
    35     error_t        error;
    3635    char           kbuf[CONFIG_VFS_MAX_PATH_LENGTH];
    3736
     
    3938    process_t * process = this->process;
    4039
     40#if (DEBUG_SYS_MKFIFO || CONFIG_INSTRUMENTATION_SYSCALLS)
     41uint64_t     tm_start = hal_get_cycles();
     42#endif
     43
     44#if DEBUG_SYS_MKFIFO
     45if( DEBUG_SYS_MKFIFO < tm_end )
     46printk("\n[%s] thread[%x,%x] enter for <%s> / cycle %d\n",
     47__FUNCTION__, process->pid, this->trdid, pathname, (uint32_t)tm_end );
     48#endif
     49 
    4150    // check fd_array not full
    4251    if( process_fd_array_full() )
    4352    {
    44         printk("\n[ERROR] in %s : file descriptor array full for process %x\n",
    45                __FUNCTION__ , process->pid );
     53
     54#if DEBUG_SYSCALLS_ERROR
     55printk("\n[ERROR] in %s : file descriptor array full for process %x\n",
     56__FUNCTION__ , process->pid );
     57#endif
    4658        this->errno = ENFILE;
    4759        return -1;
     
    5163    if( hal_strlen_from_uspace( pathname ) >= CONFIG_VFS_MAX_PATH_LENGTH )
    5264    {
    53         printk("\n[ERROR] in %s : pathname too long\n", __FUNCTION__ );
     65
     66#if DEBUG_SYSCALLS_ERROR
     67printk("\n[ERROR] in %s : pathname too long\n", __FUNCTION__ );
     68#endif
    5469        this->errno = ENFILE;
    5570        return -1;
     
    5772
    5873    // copy pathname in kernel space
    59     hal_strcpy_from_uspace( kbuf , pathname , CONFIG_VFS_MAX_PATH_LENGTH );
     74    hal_strcpy_from_uspace( XPTR( local_cxy , kbuf ),
     75                            pathname,
     76                            CONFIG_VFS_MAX_PATH_LENGTH );
    6077
    6178    printk("\n[ERROR] in %s : not implemented yet\n", __FUNCTION__ );
    6279    return -1;
    6380
    64     if( error )
    65     {
    66         printk("\n[ERROR] in %s : cannot create named FIFO %s\n",
    67                __FUNCTION__ , kbuf );
    68         this->errno = error;
    69         return -1;
    70     }
     81#if (DEBUG_SYS_MKFIFO || CONFIG_INSTRUMENTATION_SYSCALLS)
     82uint64_t     tm_end = hal_get_cycles();
     83#endif
    7184
    72     return 0;
     85#if DEBUG_SYS_MKFIFO
     86if( DEBUG_SYS_MKFIFO < tm_end )
     87printk("\n[%s] thread[%x,%x] exit for <%s> / cycle %d\n",
     88__FUNCTION__, process->pid, this->trdid, pathname, (uint32_t)tm_end );
     89#endif
     90 
     91#if CONFIG_INSTRUMENTATION_SYSCALLS
     92hal_atomic_add( &syscalls_cumul_cost[SYS_MKFIFO] , tm_end - tm_start );
     93hal_atomic_add( &syscalls_occurences[SYS_MKFIFO] , 1 );
     94#endif
    7395
    7496} // end sys_mkfifo()
  • trunk/kernel/syscalls/sys_mmap.c

    r635 r637  
    4141{
    4242    vseg_t      * vseg;
    43     cxy_t         vseg_cxy;
    44     vseg_type_t   vseg_type;
     43    cxy_t         vseg_cxy;     // target cluster for the vseg
     44    vseg_type_t   vseg_type;    // vseg type
    4545    mmap_attr_t   k_attr;       // attributes copy in kernel space
    4646    xptr_t        mapper_xp;
    47     error_t       error;
    4847    reg_t         save_sr;      // required to enable IRQs
    4948
     
    6261
    6362    // check user buffer (containing attributes) is mapped
    64     error = vmm_get_vseg( process , (intptr_t)attr , &vseg );
    65 
    66     if( error )
     63    if( vmm_get_vseg( process , (intptr_t)attr , &vseg ) )
    6764    {
    6865
     
    7673
    7774    // copy attributes from user space to kernel space
    78     hal_copy_from_uspace( local_cxy,
    79                           &k_attr,
     75    hal_copy_from_uspace( XPTR( local_cxy , &k_attr ),
    8076                          attr,
    8177                          sizeof(mmap_attr_t) );
     
    119115
    120116    // test mmap type : can be FILE / ANON / REMOTE
     117    // to define vseg_type & vseg_cxy
    121118
    122119    /////////////////////////////////////////////////////////// MAP_FILE
     
    126123#if (DEBUG_SYS_MMAP & 1)
    127124if ( DEBUG_SYS_MMAP < tm_start )
    128 printk("\n[%s] thread[%x,%x] map file : fdid %d / offset %d / %d bytes\n",
     125printk("\n[%s] thread[%x,%x] type file : fdid %d / offset %x / %x bytes\n",
    129126__FUNCTION__, process->pid, this->trdid, fdid, offset, length );
    130127#endif
    131128
    132             // FIXME: handle concurent delete of file by another thread closing it
     129            // FIXME: handle concurent delete of file by another thread
    133130
    134131                if( fdid >= CONFIG_PROCESS_FILE_MAX_NR )
     
    228225#if (DEBUG_SYS_MMAP & 1)
    229226if ( DEBUG_SYS_MMAP < tm_start )
    230 printk("\n[%s] thread[%x,%x] map anon / %d bytes / cluster %x\n",
     227printk("\n[%s] thread[%x,%x] type anon / %x bytes / cluster %x\n",
    231228__FUNCTION__, process->pid, this->trdid, length, vseg_cxy );
    232229#endif
     
    242239#if (DEBUG_SYS_MMAP & 1)
    243240if ( DEBUG_SYS_MMAP < tm_start )
    244 printk("\n[%s] thread[%x,%x] map remote / %d bytes / cluster %x\n",
     241printk("\n[%s] thread[%x,%x] type remote / %x bytes / target cluster %x\n",
    245242__FUNCTION__, process->pid, this->trdid, length, vseg_cxy );
    246243#endif
    247244 
    248         if( cluster_is_undefined( vseg_cxy ) )
     245        if( cluster_is_active( vseg_cxy ) == false )
    249246        {
    250247
     
    266263    process_t * ref_ptr = GET_PTR( ref_xp );
    267264
    268     // create the vseg in reference cluster
     265    // register vseg in reference VSL
    269266    if( local_cxy == ref_cxy )
    270267    {
     
    306303    }
    307304
    308     // copy vseg base address to user space
    309     hal_copy_to_uspace( local_cxy,
    310                         &vseg->min,
    311                         &attr->addr,
     305    // copy vseg base address to user space mmap_attr_t
     306    hal_copy_to_uspace( &attr->addr,
     307                        XPTR( ref_cxy , &vseg->min ),
    312308                        sizeof(intptr_t) );
    313309    hal_fence();
     
    324320#if DEBUG_SYS_MMAP
    325321if ( DEBUG_SYS_MMAP < tm_end )
    326 printk("\n[%s] thread[%x,%x] exit / %s / cxy %x / base %x / size %d / cycle %d\n",
     322printk("\n[%s] thread[%x,%x] exit / %s / cxy %x / base %x / size %x / cycle %d\n",
    327323__FUNCTION__, process->pid, this->trdid,
    328324vseg_type_str(vseg->type), vseg->cxy, vseg->min, length, (uint32_t)tm_end );
  • trunk/kernel/syscalls/sys_open.c

    r625 r637  
    7777
    7878    // copy pathname in kernel space
    79     hal_strcpy_from_uspace( kbuf , pathname , CONFIG_VFS_MAX_PATH_LENGTH );
     79    hal_strcpy_from_uspace( XPTR( local_cxy , kbuf ) , pathname , CONFIG_VFS_MAX_PATH_LENGTH );
    8080
    8181#if DEBUG_SYS_OPEN
  • trunk/kernel/syscalls/sys_opendir.c

    r635 r637  
    8585
    8686    // copy pathname in kernel space
    87     hal_strcpy_from_uspace( kbuf , pathname , CONFIG_VFS_MAX_PATH_LENGTH );
     87    hal_strcpy_from_uspace( XPTR( local_cxy , kbuf ),
     88                            pathname,
     89                            CONFIG_VFS_MAX_PATH_LENGTH );
    8890
    8991#if DEBUG_SYS_OPENDIR
     
    174176
    175177    // set ident value in user buffer
    176     hal_copy_to_uspace( local_cxy,
    177                         &ident,
    178                         dirp,
     178    hal_copy_to_uspace( dirp,
     179                        XPTR( local_cxy , &ident ),
    179180                        sizeof(intptr_t) );
    180181
  • trunk/kernel/syscalls/sys_place_fork.c

    r623 r637  
    4040    process_t * process = this->process;
    4141
     42#if (DEBUG_SYS_PLACE_FORK || CONFIG_INSTRUMENTATION_SYSCALLS)
     43uint64_t     tm_start = hal_get_cycles();
     44#endif
     45
     46#if DEBUG_SYS_PLACE_FORK
     47if( DEBUG_SYS_PLACE_FORK < tm_start )
     48printk("\n[%s] thread[%x,%x] enter / cxy %x / cycle %d\n",
     49__FUNCTION__, process->pid, this->trdid, cxy, (uint32_t)tm_start );
     50#endif
     51
    4252    // check cxy argument
    43     if( cluster_is_undefined( cxy ) )
     53    if( cluster_is_active( cxy ) == false )
    4454    {
    4555       
     
    5666    this->fork_cxy  = cxy;
    5767
     68#if (DEBUG_SYS_PLACE_FORK || CONFIG_INSTRUMENTATION_SYSCALLS)
     69uint64_t     tm_end = hal_get_cycles();
     70#endif
     71
     72#if DEBUG_SYS_PLACE_FORK
     73if( DEBUG_SYS_PLACE_FORK < tm_end )
     74printk("\n[%s] thread[%x,%x] exit / cycle %d\n",
     75__FUNCTION__ , process->pid, this->trdid, (uint32_t)tm_end );
     76#endif
     77
     78#if CONFIG_INSTRUMENTATION_SYSCALLS
     79hal_atomic_add( &syscalls_cumul_cost[SYS_PLACE_FORK] , tm_end - tm_start );
     80hal_atomic_add( &syscalls_occurences[SYS_PLACE_FORK] , 1 );
     81#endif
     82
    5883        return 0;
    5984
  • trunk/kernel/syscalls/sys_readdir.c

    r635 r637  
    112112
    113113    // copy dirent pointer to user buffer
    114     hal_copy_to_uspace( local_cxy,
    115                         &direntp,
    116                         buffer,
     114    hal_copy_to_uspace( buffer,
     115                        XPTR( local_cxy , &direntp ),
    117116                        sizeof(void *) );
    118117
  • trunk/kernel/syscalls/sys_rename.c

    r613 r637  
    22 * sys_rename.c - Rename a file or a directory.
    33 *
    4  * Author        Alain Greiner (2016,2017,2018)
     4 * Author        Alain Greiner (2016,2017,2018,2019)
    55 *
    66 * Copyright (c) UPMC Sorbonne Universites
     
    7575
    7676    // copy old name an new name in kernel space
    77     hal_strcpy_from_uspace( k_old , old , CONFIG_VFS_MAX_PATH_LENGTH );
    78     hal_strcpy_from_uspace( k_new , new , CONFIG_VFS_MAX_PATH_LENGTH );
     77    hal_strcpy_from_uspace( XPTR( local_cxy , k_old ) , old , CONFIG_VFS_MAX_PATH_LENGTH );
     78    hal_strcpy_from_uspace( XPTR( local_cxy , k_new ) , new , CONFIG_VFS_MAX_PATH_LENGTH );
    7979
    8080#if DEBUG_SYS_RENAME
  • trunk/kernel/syscalls/sys_rmdir.c

    r604 r637  
    22 * sys_rmdir.c - Remove a directory from file system.
    33 *
    4  * Author    Alain Greiner (2016,2017)
     4 * Author    Alain Greiner (2016,2017,2018,2019)
    55 *
    66 * Copyright (c) 2015 UPMC Sorbonne Universites
     
    4242        process_t * process = this->process;
    4343
     44#if (DEBUG_SYS_RMDIR || CONFIG_INSTRUMENTATION_SYSCALLS)
     45uint64_t     tm_start = hal_get_cycles();
     46#endif
     47
    4448    // check pathname length
    4549    if( hal_strlen_from_uspace( pathname ) >= CONFIG_VFS_MAX_PATH_LENGTH )
     
    5458
    5559    // copy pathname in kernel space
    56     hal_strcpy_from_uspace( kbuf , pathname , CONFIG_VFS_MAX_PATH_LENGTH );
     60    hal_strcpy_from_uspace( XPTR( local_cxy , kbuf ),
     61                            pathname,
     62                            CONFIG_VFS_MAX_PATH_LENGTH );
    5763
    5864    // get cluster and local pointer on reference process
  • trunk/kernel/syscalls/sys_sem.c

    r635 r637  
    5858    process_t      * process = this->process;
    5959
     60#if (DEBUG_SYS_SEM || CONFIG_INSTRUMENTATION_SYSCALLS)
     61uint64_t     tm_start = hal_get_cycles();
     62#endif
     63
    6064#if DEBUG_SYS_SEM
    61 uint64_t    tm_start;
    62 uint64_t    tm_end;
    63 tm_start = hal_get_cycles();
    6465if( DEBUG_SYS_SEM < tm_start )
    6566printk("\n[DBG] %s : thread %x in process %x enter for %s / cycle %d\n",
     
    137138 
    138139            // return value to user
    139             hal_copy_to_uspace( local_cxy,
    140                                 &current,
    141                                 current_value,
     140            hal_copy_to_uspace( current_value,
     141                                XPTR( local_cxy , &current ),
    142142                                sizeof(uint32_t) );
    143143        }
     
    224224    hal_fence();
    225225
     226#if (DEBUG_SYS_SEM || CONFIG_INSTRUMENTATION_SYSCALLS)
     227uint64_t     tm_end = hal_get_cycles();
     228#endif
     229
    226230#if DEBUG_SYS_SEM
    227 tm_end = hal_get_cycles();
    228231if( DEBUG_SYS_SEM < tm_end )
    229232printk("\n[DBG] %s : thread %x in process %x exit for %s / cost = %d / cycle %d\n",
     
    232235#endif
    233236
     237#if CONFIG_INSTRUMENTATION_SYSCALLS
     238hal_atomic_add( &syscalls_cumul_cost[SYS_SEM] , tm_end - tm_start );
     239hal_atomic_add( &syscalls_occurences[SYS_SEM] , 1 );
     240#endif
     241
    234242    return 0;
    235243
  • trunk/kernel/syscalls/sys_stat.c

    r635 r637  
    8080
    8181    // copy pathname in kernel space
    82     hal_strcpy_from_uspace( kbuf , pathname , CONFIG_VFS_MAX_PATH_LENGTH );
     82    hal_strcpy_from_uspace( XPTR( local_cxy , kbuf ),
     83                            pathname,
     84                            CONFIG_VFS_MAX_PATH_LENGTH );
    8385
    8486#if DEBUG_SYS_STAT
     
    121123   
    122124    // copy k_stat to u_stat
    123     hal_copy_to_uspace( local_cxy,
    124                         &k_stat,
    125                         u_stat,
     125    hal_copy_to_uspace( u_stat,
     126                        XPTR( local_cxy , &k_stat ),
    126127                        sizeof(struct stat) );
    127128
  • trunk/kernel/syscalls/sys_thread_create.c

    r635 r637  
    6666
    6767#if DEBUG_SYS_THREAD_CREATE
    68 tm_start = hal_get_cycles();
    6968if( DEBUG_SYS_THREAD_CREATE < tm_start )
    7069printk("\n[%s] thread[%x,%x] enter / cycle %d\n",
     
    7372
    7473    // check trdid buffer in user space
    75     error = vmm_get_vseg( process , (intptr_t)trdid_ptr , &vseg );
    76 
    77     if ( error )
     74    if( vmm_get_vseg( process , (intptr_t)trdid_ptr , &vseg ) )
    7875    {
    7976
     
    8986    if( user_attr != NULL )
    9087    {
    91             error = vmm_get_vseg( process , (intptr_t)user_attr , &vseg );
    92 
    93             if( error )
     88            if( vmm_get_vseg( process , (intptr_t)user_attr , &vseg ) )
    9489            {
    9590
     
    10297            }
    10398       
    104             hal_copy_from_uspace( local_cxy,
    105                               &kern_attr,
     99            hal_copy_from_uspace( XPTR( local_cxy , &kern_attr ),
    106100                              user_attr,
    107101                              sizeof(pthread_attr_t) );
     
    109103
    110104        // check start_func in user space
    111         error = vmm_get_vseg( process , (intptr_t)start_func , &vseg );
    112 
    113     if( error )
     105        if( vmm_get_vseg( process , (intptr_t)start_func , &vseg ) )
    114106    {
    115107
     
    125117        if( start_args != NULL )
    126118    {
    127         error = vmm_get_vseg( process , (intptr_t)start_args , &vseg );
    128 
    129             if( error )
     119        if( vmm_get_vseg( process , (intptr_t)start_args , &vseg ) )
    130120            {
    131121
     
    145135            if( kern_attr.attributes & PT_ATTR_CLUSTER_DEFINED )
    146136            {
    147                     if( cluster_is_undefined( kern_attr.cxy ) )
     137                    if( cluster_is_active( kern_attr.cxy ) == false )
    148138                    {
    149139
     
    159149        else
    160150        {
    161             child_cxy = dqdt_get_cluster_for_process();
     151            child_cxy = dqdt_get_cluster_for_thread( LOCAL_CLUSTER->dqdt_root_xp );
    162152        }
    163153        }
     
    165155        {
    166156        kern_attr.attributes = PT_ATTR_DETACH | PT_ATTR_CLUSTER_DEFINED;
    167         child_cxy           = dqdt_get_cluster_for_process();
     157        child_cxy = dqdt_get_cluster_for_thread( LOCAL_CLUSTER->dqdt_root_xp );
    168158        }
    169159
     
    209199        // returns trdid to user space
    210200        trdid = hal_remote_l32( XPTR( child_cxy , &child_ptr->trdid ) );
    211         hal_copy_to_uspace( local_cxy,
    212                         &trdid,
    213                         trdid_ptr,
     201        hal_copy_to_uspace( trdid_ptr,
     202                        XPTR( local_cxy , &trdid ),
    214203                        sizeof(pthread_t) );
    215204
  • trunk/kernel/syscalls/sys_thread_detach.c

    r566 r637  
    22 * sys_thread_detach.c - detach a joinable thread
    33 *
    4  * Authors   Alain Greiner (2016,2017)
     4 * Authors   Alain Greiner (2016,2017,2018,2019)
    55 *
    6  * Copyright (c) 2011,2012 UPMC Sorbonne Universites
     6 * Copyright (c) UPMC Sorbonne Universites
    77 *
    88 * This file is part of ALMOS-MKH.
     
    4848
    4949    // check trdid argument
    50         if( (target_ltid >= CONFIG_THREADS_MAX_PER_CLUSTER) || cluster_is_undefined( target_cxy ) ) 
     50        if( (target_ltid >= CONFIG_THREADS_MAX_PER_CLUSTER) ||
     51        (cluster_is_active( target_cxy ) == false) ) 
    5152        {
    5253        printk("\n[ERROR] in %s : illegal trdid argument\n", __FUNCTION__ );
  • trunk/kernel/syscalls/sys_thread_join.c

    r633 r637  
    22 * sys_thread_join.c - passive wait on the end of a given thread.
    33 *
    4  * Authors    Alain Greiner (2016,2017)
    5  *
    6  * Copyright (c) 2011,2012 UPMC Sorbonne Universites
     4 * Authors    Alain Greiner (2016,2017,2018,2019)
     5 *
     6 * Copyright (c) UPMC Sorbonne Universites
    77 *
    88 * This file is part of ALMOS-MKH.
     
    7272
    7373    // check trdid argument
    74         if( (target_ltid >= CONFIG_THREADS_MAX_PER_CLUSTER) || cluster_is_undefined(target_cxy) )
     74        if( (target_ltid >= CONFIG_THREADS_MAX_PER_CLUSTER) ||
     75        (cluster_is_active(target_cxy) == false) )
    7576        {
    7677
  • trunk/kernel/syscalls/sys_thread_wakeup.c

    r566 r637  
    11/*
    2  * sys_thread_wakeup.c - wakeup all indicated threads
     2 * sys_thread_wakeup.c - wakeup indicated thread
    33 *
    4  * Author    Alain Greiner (2016,2017)
     4 * Author    Alain Greiner (2016,2017,2018,2019)
    55 *
    66 * Copyright (c) UPMC Sorbonne Universites
     
    3636    process_t * process = this->process;
    3737
     38#if (DEBUG_SYS_THREAD_WAKEUP || CONFIG_INSTRUMENTATION_SYSCALLS)
     39uint64_t     tm_start = hal_get_cycles();
     40#endif
     41
    3842#if DEBUG_SYS_THREAD_WAKEUP
    39 uint64_t     tm_start;
    40 uint64_t     tm_end;
    41 tm_start = hal_get_cycles();
    4243if( DEBUG_SYS_THREAD_WAKEUP < tm_start )
    43 printk("\n[DBG] %s : thread %x in process enter to activate thread %x / cycle %d\n",
     44printk("\n[%s] thread %x in process enter to activate thread %x / cycle %d\n",
    4445__FUNCTION__, this->trdid, process->pid, trdid, (uint32_t)tm_start );
    4546#endif
     
    5051
    5152    // check trdid argument
    52         if( (target_ltid >= CONFIG_THREADS_MAX_PER_CLUSTER) || cluster_is_undefined( target_cxy ) ) 
     53        if( (target_ltid >= CONFIG_THREADS_MAX_PER_CLUSTER) ||
     54        (cluster_is_active( target_cxy ) == false) ) 
    5355        {
    5456
     
    7880    thread_unblock( thread_xp , THREAD_BLOCKED_GLOBAL );
    7981
     82#if (DEBUG_SYS_THREAD_WAKEUP || CONFIG_INSTRUMENTATION_SYSCALLS)
     83uint64_t     tm_end = hal_get_cycles();
     84#endif
     85
     86
    8087#if DEBUG_SYS_THREAD_WAKEUP
    81 tm_end = hal_get_cycles();
    8288if( DEBUG_SYS_THREAD_WAKEUP < tm_end )
    83 printk("\n[DBG] %s : thread %x in process %x exit / thread %x activated / cycle %d\n",
     89printk("\n[%s] thread %x in process %x exit / thread %x activated / cycle %d\n",
    8490__FUNCTION__ , this->trdid, process->pid, trdid, (uint32_t)tm_end );
     91#endif
     92
     93#if CONFIG_INSTRUMENTATION_SYSCALLS
     94hal_atomic_add( &syscalls_cumul_cost[SYS_THREAD_WAKEUP] , tm_end - tm_start );
     95hal_atomic_add( &syscalls_occurences[SYS_THREAD_WAKEUP] , 1 );
    8596#endif
    8697
  • trunk/kernel/syscalls/sys_timeofday.c

    r635 r637  
    5050        process_t *    process = this->process;
    5151
     52#if (DEBUG_SYS_TIMEOFDAY || CONFIG_INSTRUMENTATION_SYSCALLS)
     53uint64_t     tm_start = hal_get_cycles();
     54#endif
     55
     56#if DEBUG_SYS_TIMEOFDAY
     57if( DEBUG_SYS_TIMEOFDAY < tm_start )
     58printk("\n[%s] thread[%x,%x] enter / cycle %d\n",
     59__FUNCTION__, process->pid, this->trdid, (uint32_t)tm_start );
     60#endif
     61 
    5262    // check tz (non supported / must be null)
    5363    if( tz )
     
    8292
    8393    // copy values to user space
    84         hal_copy_to_uspace( local_cxy,
    85                         &k_tv,
    86                         tv,
     94        hal_copy_to_uspace( tv,
     95                        XPTR( local_cxy , &k_tv ),
    8796                        sizeof(struct timeval) );
    8897
    8998    hal_fence();
    9099
     100#if (DEBUG_SYS_TIMEOFDAY || CONFIG_INSTRUMENTATION_SYSCALLS)
     101uint64_t     tm_end = hal_get_cycles();
     102#endif
     103
     104#if DEBUG_SYS_TIMEOFDAY
     105if( DEBUG_SYS_TIMEOFDAY < tm_end )
     106printk("\n[%s] thread[%x,%x] exit / cycle %d\n",
     107__FUNCTION__, process->pid, this->trdid, (uint32_t)tm_end );
     108#endif
     109 
     110#if CONFIG_INSTRUMENTATION_SYSCALLS
     111hal_atomic_add( &syscalls_cumul_cost[SYS_TIMEOFDAY] , tm_end - tm_start );
     112hal_atomic_add( &syscalls_occurences[SYS_TIMEOFDAY] , 1 );
     113#endif
     114
    91115        return 0;
    92116
  • trunk/kernel/syscalls/sys_trace.c

    r566 r637  
    22 * sys_trace.c - activate / deactivate the context switches trace for a given core
    33 *
    4  * Author    Alain Greiner (c) (2016,2017,2018)
     4 * Author    Alain Greiner (c) (2016,2017,2018,2019)
    55 *
    66 * Copyright (c) UPMC Sorbonne Universites
     
    4040    process_t * process = this->process;
    4141
     42#if (DEBUG_SYS_TRACE || CONFIG_INSTRUMENTATION_SYSCALLS)
     43uint64_t     tm_start = hal_get_cycles();
     44#endif
     45
    4246#if DEBUG_SYS_TRACE
    43 uint64_t    tm_start;
    44 uint64_t    tm_end;
    45 tm_start = hal_get_cycles();
    4647if( DEBUG_SYS_TRACE < tm_start )
    47 printk("\n[DBG] %s : thread %d enter / process %x / cycle = %d\n",
    48 __FUNCTION__, this, this->process->pid, (uint32_t)tm_start );
     48printk("\n[%s] thread[%x,%x] enters / cycle = %d\n",
     49__FUNCTION__, this->process->pid, this->trdid, (uint32_t)tm_start );
    4950#endif
    5051
    5152    // check cluster identifier
    52     if( cluster_is_undefined( cxy ) )
     53    if( cluster_is_active( cxy ) == false )
    5354    {
    5455
     
    8586    hal_fence();
    8687
    87 #if DEBUG_SYS_TRACE
    88 tm_end = hal_get_cycles();
    89 if( DEBUG_SYS_TRACE < tm_end )
    90 printk("\n[DBG] %s : thread %x exit / process %x / cost = %d / cycle %d\n",
    91 __FUNCTION__, this, this->process->pid, (uint32_t)(tm_end - tm_start) , (uint32_t)tm_end );
     88#if (DEBUG_SYS_TRACE || CONFIG_INSTRUMENTATION_SYSCALLS)
     89uint64_t     tm_end = hal_get_cycles();
    9290#endif
    9391
     92#if DEBUG_SYS_TRACE
     93if( DEBUG_SYS_TRACE < tm_end )
     94printk("\n[%s] thread[%x,%x] exit / cycle %d\n",
     95__FUNCTION__, this->process->pid, this->trdid, (uint32_t)tm_end );
     96#endif
     97
     98#if CONFIG_INSTRUMENTATION_SYSCALLS
     99hal_atomic_add( &syscalls_cumul_cost[SYS_TRACE] , tm_end - tm_start );
     100hal_atomic_add( &syscalls_occurences[SYS_TRACE] , 1 );
     101#endif
    94102    return 0;
    95103
  • trunk/kernel/syscalls/sys_unlink.c

    r610 r637  
    22 * sys_unlink.c - unlink a file or directory from VFS
    33 *
    4  * Author     Alain Greiner (2016,2017,2018)
     4 * Author     Alain Greiner (2016,2017,2018,2019)
    55 *
    66 * Copyright (c)  UPMC Sorbonne Universites
     
    6060
    6161    // copy pathname in kernel space
    62     hal_strcpy_from_uspace( kbuf , pathname , CONFIG_VFS_MAX_PATH_LENGTH );
     62    hal_strcpy_from_uspace( XPTR( local_cxy , kbuf ),
     63                            pathname,
     64                            CONFIG_VFS_MAX_PATH_LENGTH );
    6365
    6466#if DEBUG_SYS_UNLINK
  • trunk/kernel/syscalls/sys_wait.c

    r635 r637  
    5353    pid_t       pid     = process->pid;
    5454
     55
    5556#if DEBUG_SYS_WAIT
    56 uint64_t    cycle = hal_get_cycles();
     57uint64_t cycle = hal_get_cycles();
    5758if( DEBUG_SYS_WAIT < cycle )
    5859printk("\n[%s] thread[%x,%x] enter / cycle %d\n",
     
    153154#endif
    154155                 // return child termination state  to parent process
    155                  hal_copy_to_uspace( local_cxy,
    156                                      &child_state,
    157                                      status,
     156                 hal_copy_to_uspace( status,
     157                                     XPTR( local_cxy , &child_state ),
    158158                                     sizeof(int) );
    159159                 return child_pid;
     
    192192
    193193    // never executed
    194         return -1;
     194        return 0;
    195195
    196196}  // end sys_wait()
  • trunk/kernel/syscalls/syscalls.h

    r626 r637  
    210210/******************************************************************************************
    211211 * [13] This function map physical memory (or a file) in the calling thread virtual space.
    212  * The <attr> argument is a pointer on a structure for arguments (see shared_syscalls.h).
     212 * The <attr> argument is a pointer on a structure for arguments (see shared_mman.h).
    213213 * The user defined virtual address (MAP_FIXED flag) is not supported.
    214214 * TODO : the access rights checking is not implemented yet [AG]
     
    560560
    561561/******************************************************************************************
    562  * [41] This function implements the non-standard get_core() syscall.
     562 * [41] This function implements the non-standard get_core_id() syscall.
    563563 * It returns in <cxy> and <lid> the calling core cluster and local index.
    564564 ******************************************************************************************
     
    567567 * @ return 0 if success / return -1 if illegal arguments
    568568 *****************************************************************************************/
    569 int sys_get_core( uint32_t * cxy,
    570                   uint32_t * lid );
     569int sys_get_core_id( uint32_t * cxy,
     570                     uint32_t * lid );
    571571
    572572/******************************************************************************************
     
    696696int sys_fsync( uint32_t file_id );
    697697
     698/******************************************************************************************
     699 * [53] This function implements the non-standard "get_best_core" syscall.
     700 * It selects, in a macro-cluster specified by the <base_cxy> and <level> arguments,
     701 * the core that has the lowest load.
     702 * When an active core has been found in the target macro-cluster, it writes into the
     703 * <cxy> and <lid> buffers the cluster identifier and the core local index, and return 0.
     704 * It returns -1 in case of illegal arguments (level / cxy / lid).
     705 * It returns +1 if there is no active core in specified macro-cluster.
     706 ******************************************************************************************
     707 * @ base_cxy : [in]  any cluster identifier in macro-cluster.
     708 * @ level    : [in]  macro-cluster level in [1,2,3,4,5].
     709 * @ cxy      : [out] selected core cluster identifier.
     710 * @ lid      : [out] selected core local index in cluster.
 712 * @ return 0 if success / -1 if illegal arguments / +1 if no core in macro-cluster.
     712 *****************************************************************************************/
     713int sys_get_best_core( uint32_t   base_cxy,
     714                       uint32_t   level,
     715                       uint32_t * cxy,
     716                       uint32_t * lid );
     717
     718/******************************************************************************************
     719 * [54] This function implements the non-standard "get_nb_cores" syscall.
     720 * It writes in the <ncores> buffer the number of cores in the target cluster <cxy>.
     721 ******************************************************************************************
     722 * @ cxy      : [in]  target cluster identifier.
     723 * @ ncores   : [out] number of cores / 0 if cluster cxy undefined in architecture.
     724 * @ return 0 if success / return -1 if illegal "ncores" arguments.
     725 *****************************************************************************************/
     726int sys_get_nb_cores( uint32_t   cxy,
     727                      uint32_t * ncores );
     728
    698729#endif  // _SYSCALLS_H_
  • trunk/libs/libalmosmkh/almosmkh.c

    r626 r637  
    22 * almosmkh.c - User level ALMOS-MKH specific library implementation.
    33 *
    4  * Author     Alain Greiner (2016,2017,2018)
     4 * Author     Alain Greiner (2016,2017,2018,2019)
    55 *
    66 * Copyright (c) UPMC Sorbonne Universites
     
    2424#include <almosmkh.h>
    2525#include <hal_user.h>
     26#include <hal_macros.h>
    2627#include <hal_shared_types.h>
    2728#include <syscalls_numbers.h>
     
    3233#include <mman.h>
    3334
    34 #define  MALLOC_DEBUG    0
     35#define  DEBUG_REMOTE_MALLOC     0
     36#define  DEBUG_PTHREAD_PARALLEL  1
    3537 
    36 /////////////     Non standard system calls    /////////////////////////////////
     38//////////////////////////////////////////////////////////////////////////////////////
     39/////////////     Non standard system calls    ///////////////////////////////////////
     40//////////////////////////////////////////////////////////////////////////////////////
    3741
    3842//////////////////////////
     
    6367}
    6468
    65 /////////////////////////////////
    66 int get_core( unsigned int * cxy,
    67               unsigned int * lid )
    68 {
    69     return hal_user_syscall( SYS_GET_CORE,
     69////////////////////////////////////
     70int get_core_id( unsigned int * cxy,
     71                 unsigned int * lid )
     72{
     73    return hal_user_syscall( SYS_GET_CORE_ID,
    7074                             (reg_t)cxy,
    7175                             (reg_t)lid, 0, 0 );
     76}
     77
     78/////////////////////////////////////
     79int get_nb_cores( unsigned int   cxy,
     80                  unsigned int * ncores )
     81{
     82    return hal_user_syscall( SYS_GET_NB_CORES,
     83                             (reg_t)cxy,
     84                             (reg_t)ncores, 0, 0 );
     85}
     86
     87///////////////////////////////////////////
     88int get_best_core( unsigned int   base_cxy,
     89                   unsigned int   level,
     90                   unsigned int * cxy,
     91                   unsigned int * lid )
     92{
     93    return hal_user_syscall( SYS_GET_BEST_CORE,
     94                             (reg_t)base_cxy,
     95                             (reg_t)level,
     96                             (reg_t)cxy,
     97                             (reg_t)lid );
    7298}
    7399
     
    250276}  // end get_string()
    251277
    252 
    253 ///////////////    non standard debug functions    //////////////////////////
     278//////////////////////////////////////////////////////////////////////////////////////
     279///////////////    non standard debug functions    ///////////////////////////////////
     280//////////////////////////////////////////////////////////////////////////////////////
    254281
    255282////////////////////////////////////
     
    496523
    497524
    498 ///////////////    non standard malloc functions    //////////////////////////
     525/////////////////////////////////////////////////////////////////////////////////////////
     526///////////////    non standard remote_malloc    ////////////////////////////////////////
     527/////////////////////////////////////////////////////////////////////////////////////////
    499528
    500529/////////////////////////////////////////////////////////////////////////////////////////
    501530// Global variable defining the allocator array (one per cluster)
    502531// This array (about 16 Kbytes ) will be stored in the data segment
    503 // of any application linked with this malloc libray.
 532// of any application linked with this library.
    504533/////////////////////////////////////////////////////////////////////////////////////////
    505534
     
    546575////////////////////////////////////////////////////////////////////////////////////////////
    547576
    548 #if MALLOC_DEBUG
     577#if DEBUG_REMOTE_MALLOC
    549578static void display_free_array( unsigned int cxy )
    550579{
     
    594623    unsigned int   iter;             // iterator
    595624
    596 #if MALLOC_DEBUG
    597 printf("\n[MALLOC] %s : enter for store[%x] / size = %x\n",
    598 __FUNCTION__, cxy, store_size );
     625#if DEBUG_REMOTE_MALLOC
     626unsigned int core_cxy;
     627unsigned int core_lid;
     628get_core_id( &core_cxy , &core_lid );
     629printf("\n[%s] core[%x,%d] enter for store[%x] / size = %x\n",
     630__FUNCTION__, core_cxy, core_lid, cxy, store_size );
    599631#endif
    600632
     
    635667    }
    636668
    637 #if MALLOC_DEBUG
    638 printf("\n[MALLOC] %s : mmap done for store[%x] / base = %x\n",
    639 __FUNCTION__, cxy, store_base );
     669#if DEBUG_REMOTE_MALLOC
     670printf("\n[%s] core[%x,%d] created vseg %x for store[%x]\n",
     671__FUNCTION__, core_cxy, core_lid, store_base, cxy );
    640672#endif
    641673
     
    656688    }
    657689
    658     // DEPRECATED: we don't reset the alloc_base array
    659     // because we don't want to allocate the physical memory
    660     // when the heap is created  [AG]
    661     // memset( (void *)alloc_base , 0 , alloc_size );
    662  
    663690    // split the store into various sizes blocks,
    664691    // initializes the free[] array and NEXT pointers
     
    690717
    691718
    692 #if MALLOC_DEBUG
    693 printf("\n[MALLOC] %s : completes store[%x] initialisation\n",
    694 __FUNCTION__, cxy );
    695 
     719#if DEBUG_REMOTE_MALLOC
     720printf("\n[%s] core[%x,%d] completed store[%x] initialisation\n",
     721__FUNCTION__, core_cxy, core_lid, cxy );
     722#endif
     723
     724#if (DEBUG_REMOTE_MALLOC & 1)
    696725display_free_array( cxy );
    697726#endif
     
    762791    int error;
    763792
    764 #if MALLOC_DEBUG
    765 printf("\n[MALLOC] %s : enter for size = %x / cxy = %x\n",
    766 __FUNCTION__ , size , cxy );
     793#if DEBUG_REMOTE_MALLOC
     794unsigned int core_cxy;
     795unsigned int core_lid;
     796get_core_id( &core_cxy , &core_lid );
     797printf("\n[%s] core[%x,%d] enter for size = %x / target_cxy = %x\n",
     798__FUNCTION__ , core_cxy, core_lid, size , cxy );
    767799#endif
    768800
     
    828860    unsigned char * ptr    = (unsigned char*)(store[cxy].alloc_base + offset);
    829861
    830     // DEPRECATED : we cannot check the alloc[] array,
    831     // because it has not been initialised by store_init,
    832     // to avoid physical memory allocation at heap creation [AG]
    833     // if ( *ptr != 0 )
    834     // {
    835     //    pthread_mutex_unlock( &store[cxy].mutex );
    836     //    printf("\n[PANIC] in %s : allocate an already allocated block...\n",
    837     //    __FUNCTION__ );
    838     //    return NULL;
    839     // }
    840 
    841862    // update alloc_array
    842863    *ptr = requested_index;
     
    845866    pthread_mutex_unlock( &store[cxy].mutex );
    846867 
    847 #if MALLOC_DEBUG
    848 printf("\n[MALLOC] %s : exit / base = %x / size = %x / from store[%x]\n",
    849 __FUNCTION__, base , size , cxy );
     868#if DEBUG_REMOTE_MALLOC
     869printf("\n[%s] core[%x,%d] exit / base = %x / size = %x / from store[%x]\n",
     870__FUNCTION__, core_cxy, core_lid, base , size , cxy );
    850871#endif
    851872
     
    853874
    854875} // end remote_malloc()
    855 
    856 
    857876
    858877//////////////////////////////////////////
     
    920939
    921940    return new_ptr;
    922 }
     941
     942}  // end remote_realloc()
     943
    923944
    924945//////////////////////////////////////////////////////
     
    9911012{
    9921013
    993 #if MALLOC_DEBUG
     1014#if DEBUG_REMOTE_MALLOC
    9941015printf("\n[MALLOC] %s : enter for block = %x / cxy = %x\n",
    9951016__FUNCTION__, ptr, cxy );
     
    10521073    pthread_mutex_unlock( &store[cxy].mutex );
    10531074
    1054 #if MALLOC_DEBUG
     1075#if DEBUG_REMOTE_MALLOC
    10551076printf("\n[MALLOC] %s : conmpletes for block = %x / cxy = %x\n",
    10561077__FUNCTION__, ptr, cxy );
     
    10581079
    10591080} // end remote_free()
     1081
     1082/////////////////////////////////////////////////////////////////////////////////////////
     1083///////////////    non standard pthread_parallel_create    //////////////////////////////
     1084/////////////////////////////////////////////////////////////////////////////////////////
     1085
     1086#define X_MAX                   16              // max number of clusters in a row
     1087#define Y_MAX                   16              // max number of clusters in a column
     1088#define CLUSTERS_MAX            X_MAX * Y_MAX
     1089#define LEVEL_MAX               5
     1090#define CORES_MAX               4               // max number of cores per cluster
     1091
     1092typedef struct build_args_s           
     1093{
     1094    unsigned char       cxy;                    // this thread cluster identifier
     1095    unsigned char       level;                  // this thread level in quad-tree
     1096    unsigned char       parent_cxy;             // parent thread cluster identifier
     1097    unsigned char       root_level;             // quad-tree root level
     1098    void              * work_func;              // pointer on work function pointer
     1099    void              * work_args_array;        // pointer on 2D array of pointers
     1100    pthread_barrier_t * parent_barriers_array;  // pointer on 1D array of barriers
     1101    unsigned int        error;                  // return value : 0 if success
     1102}
     1103build_args_t;
     1104
     1105/////////////////////////////////////////////////////////////////////////////////////////
     1106//      Global variables used for inter-thread communications
     1107/////////////////////////////////////////////////////////////////////////////////////////
     1108
     1109pthread_attr_t    build_attr   [CLUSTERS_MAX][LEVEL_MAX];   // POSIX thread attributes
     1110
     1111build_args_t      build_args   [CLUSTERS_MAX][LEVEL_MAX];   // build function arguments
     1112
     1113pthread_barrier_t build_barrier[CLUSTERS_MAX][LEVEL_MAX];   // parent/child synchro
     1114
     1115pthread_attr_t    work_attr    [CLUSTERS_MAX][CORES_MAX];    // POSIX thread attributes
     1116
     1117//////////////////////////////////////////////////////////
     1118static void pthread_recursive_build( build_args_t * args )
     1119{
 1120    unsigned int   trdid;         // unused (required by pthread_create())
     1121
     1122    // get arguments
     1123    unsigned int        cxy                   = args->cxy;
     1124    unsigned int        level                 = args->level;
     1125    unsigned int        parent_cxy            = args->parent_cxy;
     1126    unsigned int        root_level            = args->root_level;
     1127    void              * work_func             = args->work_func;
     1128    void              * work_args_array       = args->work_args_array;
     1129    pthread_barrier_t * parent_barriers_array = args->parent_barriers_array;
     1130
     1131    // set error default value
     1132    build_args[cxy][level].error = 0;
     1133
     1134    ///////////////////////////////////////////////////////////
     1135    if( level == 0 )             // children are "work" threads
     1136    {
     1137        unsigned int   lid;           // core local index
     1138        unsigned int   ncores;        // number of cores in a cluster
     1139
     1140        // get number of cores per cluster
     1141        get_nb_cores( cxy , &ncores );
     1142
     1143        // kill process if no active core in cluster
     1144        // TODO this "if" should be replaced by an "assert" [AG]
     1145        if( ncores == 0 )
     1146        {
     1147            printf("\n[PANIC] in %s : no active core in cluster %x\n",
     1148            __FUNCTION__ , cxy );
     1149
     1150            // report error to parent
     1151            build_args[parent_cxy][level+1].error = 1;
     1152
     1153            // kill process
     1154            exit( EXIT_FAILURE );
     1155        }
     1156
     1157        // initialize the parent_barrier
     1158        if( pthread_barrier_init( &parent_barriers_array[cxy] , NULL , ncores + 1 ) )
     1159        {
     1160            printf("\n[ERROR] in %s : cannot initialise barrier for build thread[%x][%d]\n",
     1161            __FUNCTION__ , cxy , level );
     1162
     1163            // report error to parent
     1164            build_args[parent_cxy][level+1].error = 1;
     1165        }
     1166
     1167#if DEBUG_PTHREAD_PARALLEL
     1168printf("\n[%s] <build> thread[%x][%d] created barrier / %d children\n",
     1169__FUNCTION__, cxy, level, ncores + 1 );
     1170#endif
     1171        // create (ncores) "work" threads
     1172        for ( lid = 0 ; lid < ncores ; lid++ )
     1173        {
     1174            // set attributes for thread[cxy][lid]
     1175            work_attr[cxy][lid].attributes = PT_ATTR_DETACH |
     1176                                             PT_ATTR_CLUSTER_DEFINED |
     1177                                             PT_ATTR_CORE_DEFINED;
     1178            work_attr[cxy][lid].cxy        = cxy;
     1179            work_attr[cxy][lid].lid        = lid;
     1180
     1181            // compute pointer on thread[cxy][lid] arguments
     1182            void * work_args = *((void **)work_args_array + (cxy * CORES_MAX) + lid);
     1183
     1184            // create thread
     1185            if ( pthread_create( &trdid,                  // unused
     1186                                 &work_attr[cxy][lid],
     1187                                 work_func,
     1188                                 work_args ) )
     1189            {
     1190                printf("\n[ERROR] in %s : cannot create work thread[%x,%x]\n",
     1191                __FUNCTION__ , cxy , lid );
     1192
     1193                // report error to parent
     1194                build_args[parent_cxy][level+1].error = 1;
     1195            }
     1196
     1197#if DEBUG_PTHREAD_PARALLEL
     1198printf("\n[%s] <build> thread[%x][%d] created <work> thread[%x][%d]\n",
     1199__FUNCTION__, cxy, level, cxy, lid );
     1200#endif
     1201        }
     1202
     1203        // wait on barrier until "work" children threads completed
     1204        if( pthread_barrier_wait( &parent_barriers_array[cxy] ) )
     1205        {
     1206            printf("\n[ERROR] in %s / first barrier for <build> thread[%x][%d]\n",
     1207            __FUNCTION__ , cxy , level );
     1208
     1209            // report error to parent
     1210            build_args[parent_cxy][level+1].error = 1;
     1211        }
     1212
     1213#if DEBUG_PTHREAD_PARALLEL
     1214printf("\n[%s] <build> thread[%x][%d] resume after children completion\n",
     1215__FUNCTION__, cxy, level );
     1216#endif
     1217
     1218    }  // end level == 0
     1219
     1220    ////////////////////////////////////////////////////////////
     1221    else                        // children are "build" threads
     1222    {
     1223        // the 4 children threads can be created in any core of each quarters
     1224        // of the parent macro-cluster
     1225
     1226        unsigned int parent_x;          // X coordinate of parent macro-cluster
     1227        unsigned int parent_y;          // Y coordinate of parent macro-cluster
     1228        unsigned int child_x;           // X coordinate of child macro-cluster
     1229        unsigned int child_y;           // Y coordinate of child macro-cluster
     1230        unsigned int child_cxy[2][2];   // selected cluster for child thread
     1231        unsigned int child_lid[2][2];   // selected core index for child thread
     1232        int          child_sts[2][2];   // -1 if error / 0 if success / +1 if not found
     1233        unsigned int x;                 // X loop index for children
     1234        unsigned int y;                 // Y loop index for children
     1235       
     1236        unsigned int nb_children = 0;
     1237
     1238        // get parent macro-cluster mask and half-size from level
     1239        unsigned int mask = (1 << level) - 1;
     1240        unsigned int half = (level > 0) ? (1 << (level - 1)) : 0;
     1241
     1242        // get parent macro-cluster coordinates
     1243        parent_x = HAL_X_FROM_CXY( cxy ) & ~mask;
     1244        parent_y = HAL_Y_FROM_CXY( cxy ) & ~mask;
     1245
     1246        // get child_cxy and child_lid for up to 4 children threads : 00 / 01 / 10 / 11
     1247        for (x = 0 ; x < 2 ; x++)
     1248        {
     1249            // compute child macro-cluster X coordinate
     1250            child_x = (x == 0) ? parent_x : (parent_x + half);
     1251
     1252            for (y = 0 ; y < 2 ; y++)
     1253            {
     1254                // compute child macro-cluster Y coordinate
     1255                child_y = (y == 0) ? parent_y : (parent_y + half);
     1256
     1257                // select the best core in macro-cluster
     1258                child_sts[x][y] = get_best_core( HAL_CXY_FROM_XY( child_x , child_y ),
     1259                                                 level-1,
     1260                                                 &child_cxy[x][y],
     1261                                                 &child_lid[x][y] );
     1262
     1263                if( child_sts[x][y] < 0 )  // failure => report error
     1264                {
     1265                    printf("\n[ERROR] in %s : illegal arguments for <build> thread[%x,%x]\n",
     1266                    __FUNCTION__ , cxy , level );
     1267
     1268                    // report error to parent
     1269                    build_args[parent_cxy][level+1].error = 1;
     1270                }
     1271                else if (child_sts[x][y] > 0 )  // macro-cluster undefined => does nothing
     1272                {
     1273                }
     1274                else                            // core found
     1275                {
     1276                    nb_children++;
     1277                }
     1278            }  // end for y
     1279        }  // end for x
     1280
     1281        // kill process if no active core in cluster
     1282        // TODO this "if" should be replaced by an "assert" [AG]
     1283        if( nb_children == 0 )
     1284        {
     1285            printf("\n[PANIC] in %s : no active core in macro cluster [%x,%d]\n",
     1286            __FUNCTION__ , cxy , level );
     1287
     1288            // report error to parent
     1289            build_args[parent_cxy][level+1].error = 1;
     1290
     1291            // kill process
     1292            exit( EXIT_FAILURE );
     1293        }
     1294
     1295        // initialize the barrier for (nb_children + 1)
     1296        if( pthread_barrier_init( &build_barrier[cxy][level], NULL , nb_children + 1 ) )
     1297        {
     1298            printf("\n[error] in %s : cannot initialise barrier for build thread[%x][%d]\n",
     1299            __FUNCTION__ , cxy , level );
     1300
     1301            // report error to parent
     1302            build_args[parent_cxy][level+1].error = 1;
     1303        }
     1304
     1305#if DEBUG_PTHREAD_PARALLEL
     1306printf("\n[%s] <build> thread[%x][%d] created barrier / %d children\n",
     1307__FUNCTION__, cxy, level, nb_children + 1 );
     1308#endif
     1309        // create 1 to 4 children threads
     1310        for (x = 0 ; x < 2 ; x++)
     1311        {
     1312            for (y = 0 ; y < 2 ; y++)
     1313            {
     1314                // thread is created only if macro-cluster is active
     1315                if( child_sts[x][y] == 0 )
     1316                {
     1317                    unsigned int tgt_cxy = child_cxy[x][y];
     1318                    unsigned int tgt_lid = child_lid[x][y];
     1319
     1320                    // set child thread attributes
     1321                    build_attr[tgt_cxy][level-1].attributes = PT_ATTR_DETACH |
     1322                                                              PT_ATTR_CLUSTER_DEFINED |
     1323                                                              PT_ATTR_CORE_DEFINED;
     1324                    build_attr[tgt_cxy][level-1].cxy        = tgt_cxy;
     1325                    build_attr[tgt_cxy][level-1].lid        = tgt_lid;
     1326
     1327                    // propagate build function arguments
     1328                    build_args[tgt_cxy][level-1].cxy                   = child_cxy[x][y];
     1329                    build_args[tgt_cxy][level-1].level                 = level-1;
     1330                    build_args[tgt_cxy][level-1].parent_cxy            = cxy;
     1331                    build_args[tgt_cxy][level-1].root_level            = root_level;
     1332                    build_args[tgt_cxy][level-1].work_func             = work_func;
     1333                    build_args[tgt_cxy][level-1].work_args_array       = work_args_array;
     1334                    build_args[tgt_cxy][level-1].parent_barriers_array = parent_barriers_array;
     1335                   
     1336                    // create thread
     1337                    if( pthread_create( &trdid,                         
     1338                                        &build_attr[tgt_cxy][level-1],   
     1339                                        &pthread_recursive_build,                         
     1340                                        &build_args[tgt_cxy][level-1] ) )
     1341                    {
     1342                        printf("\n[ERROR] in %s : cannot create build thread[%x][%d]\n",
     1343                        __FUNCTION__ , child_cxy , level -1 );
     1344
     1345                        // report error to parent
     1346                        build_args[parent_cxy][level+1].error = 1;
     1347                    }
     1348
     1349#if DEBUG_PTHREAD_PARALLEL
     1350printf("\n[%s] <build> thread[%x][%d] created <build> thread[%x][%d] on core[%x,%d]\n",
     1351__FUNCTION__, cxy, level, tgt_cxy, level - 1, tgt_cxy, tgt_lid );
     1352#endif
     1353                }  //end if sts[x][y]
     1354            }  // end for y
     1355        }  // end for x
     1356       
     1357        // wait on barrier until "build" children threads completed
     1358        if( pthread_barrier_wait( &build_barrier[cxy][level] ) )
     1359        {
     1360            printf("\n[ERROR] in %s / first barrier for <build> thread[%x][%d]\n",
     1361            __FUNCTION__ , cxy , level );
     1362
     1363            // report error to parent
     1364            build_args[parent_cxy][level+1].error = 1;
     1365        }
     1366
     1367#if DEBUG_PTHREAD_PARALLEL
     1368printf("\n[%s] <build> thread[%x][%d] resume after children completion\n",
     1369__FUNCTION__, cxy, level );
     1370#endif
     1371
     1372    }  // end level > 0
     1373
     1374    // report error to parent when required
     1375    if( build_args[cxy][level].error )
     1376    {
     1377        build_args[parent_cxy][level+1].error = 1;
     1378    }
     1379
     1380    // all <build> threads - but the root -
     1381    // signal completion to parent thread and exit
     1382    if( level < root_level )
     1383    {
     1384        if( pthread_barrier_wait( &build_barrier[parent_cxy][level+1] ) )
     1385        {
     1386            printf("\n[ERROR] in %s / second barrier for <build> thread[%x][%d]\n",
     1387            __FUNCTION__ , cxy , level );
     1388
     1389            // report error to parent
     1390            build_args[parent_cxy][level+1].error = 1;
     1391        }
     1392   
     1393#if DEBUG_PTHREAD_PARALLEL
     1394printf("\n[%s] <build> thread[%x][%d] exit\n",
     1395__FUNCTION__, cxy , level );
     1396#endif
     1397        // "build" thread exit
     1398        pthread_exit( NULL );
     1399    }
     1400}  // end pthread_recursive_build()
     1401
     1402///////////////////////////////////////////////////////
     1403int pthread_parallel_create( unsigned int   root_level,
     1404                             void         * work_func,
     1405                             void         * work_args_array,
     1406                             void         * parent_barriers_array )
     1407{
     1408    unsigned int   root_cxy;
     1409    unsigned int   root_lid;    // unused, but required by get_core_id()
     1410   
     1411#if DEBUG_PTHREAD_PARALLEL
     1412printf("\n[%s] enter / root_level %d / func %x / args %x / barriers %x\n",
     1413__FUNCTION__, root_level, work_func, work_args_array, parent_barriers_array );
     1414#endif
     1415
     1416    // get calling thread cluster
     1417    get_core_id( &root_cxy , &root_lid );
     1418
     1419    // set the build function arguments for the root <build> thread
     1420    build_args[root_cxy][root_level].cxy                   = root_cxy;
     1421    build_args[root_cxy][root_level].level                 = root_level;
     1422    build_args[root_cxy][root_level].root_level            = root_level;
     1423    build_args[root_cxy][root_level].work_func             = work_func;
     1424    build_args[root_cxy][root_level].work_args_array       = work_args_array;
     1425    build_args[root_cxy][root_level].parent_barriers_array = parent_barriers_array;
     1426   
     1427    // call the recursive build function
     1428    pthread_recursive_build( &build_args[root_cxy][root_level] );
     1429
     1430    // check error
     1431    if( build_args[root_cxy][root_level].error )
     1432    {
     1433        printf("\n[error] in  %s\n", __FUNCTION__ );
     1434        return -1;
     1435    }
     1436
     1437    return 0;
     1438
     1439}  // end pthread_parallel_create()
     1440
     1441
    10601442
    10611443// Local Variables:
  • trunk/libs/libalmosmkh/almosmkh.h

    r629 r637  
    22 * almosmkh.h - User level ALMOS-MKH specific library definition.
    33 *
    4  * Author     Alain Greiner (2016,2017,2018)
     4 * Author     Alain Greiner (2016,2017,2018,2019)
    55 *
    66 * Copyright (c) UPMC Sorbonne Universites
     
    7272
    7373/***************************************************************************************
    74  * This syscall returns the cluster an local index for the calling core.
     74 * This syscall returns the cluster identifier and the local index
     75 * for the calling core.
    7576 ***************************************************************************************
    7677 * @ cxy      : [out] cluster identifier.
     
    7879 * @ return always 0.
    7980 **************************************************************************************/
    80 int get_core( unsigned int * cxy,
    81               unsigned int * lid );
     81int get_core_id( unsigned int * cxy,
     82                 unsigned int * lid );
     83
     84/***************************************************************************************
     85 * This syscall returns the number of cores in a given cluster.
     86 ***************************************************************************************
     87 * @ cxy      : [in]  target cluster identifier.
     88 * @ ncores   : [out] number of cores in target cluster.
     89 * @ return always 0.
     90 **************************************************************************************/
     91int get_nb_cores( unsigned int   cxy,
     92                  unsigned int * ncores );
     93
     94/***************************************************************************************
     95 * This syscall uses the DQDT to search, in a macro-cluster specified by the
     96 * <cxy_base> and <level> arguments arguments, the core with the lowest load.
     97 * it writes in the <cxy> and <lid> buffers the selected core cluster identifier
     98 * and the local core index.
     99 ***************************************************************************************
      100 * @ cxy_base : [in]  any cluster identifier in the target macro-cluster.
     101 * @ level    : [in]  macro-cluster level in [1,2,3,4,5].
     102 * @ cxy      : [out] selected core cluster identifier.
     103 * @ lid      : [out] selectod core local index.
     104 * @ return 0 if success / 1 if no core in macro-cluster / -1 if illegal arguments.
     105 **************************************************************************************/
     106int get_best_core( unsigned int   cxy_base,
     107                   unsigned int   level,
     108                   unsigned int * cxy,
     109                   unsigned int * lid );
    82110
    83111/***************************************************************************************
    84  * This function returns the calling core cycles counter,
     112 * This function returns the value contained in the calling core cycles counter,
    85113 * taking into account a possible overflow on 32 bits architectures.
    86114 ***************************************************************************************
     
    414442                      unsigned int cxy );
    415443
     444/********* Non standard (ALMOS-MKH specific) pthread_parallel_create() syscall  *********/
     445
     446//////////////////////////////////////////////////////////////////////////////////////////
     447// This system call can be used to parallelize the creation and the termination
     448// of a parallel multi-threaded application. It removes the loop in the main thread that
      449// creates the N working threads (N sequential pthread_create() ). It also removes the
      450// loop that waits for completion of these N working threads (N sequential pthread_join() ).
     451// It creates one "work" thread (in detached mode) per core in the target architecture.
     452// Each "work" thread is identified by the [cxy][lid] indexes (cluster / local core).
     453// The pthread_parallel_create() function returns only when all "work" threads completed
     454// (successfully or not).
     455//
     456// To use this system call, the application code must define the following structures:
     457// - To define the arguments to pass to the <work> function the application must allocate
     458//   and initialize a first 2D array, indexed by [cxy] and [lid] indexes, where each slot
     459//   contains an application specific structure, and another 2D array, indexed by the same
     460//   indexes, containing pointers on these structures. This array of pointers is one
     461//   argument of the pthread_parallel_create() function.
     462// - To detect the completion of the <work> threads, the application must allocate a 1D
     463//   array, indexed by the cluster index [cxy], where each slot contains a pthread_barrier
     464//   descriptor. This barrier is initialised by the pthread_parallel_create() function,
     465//   in all cluster containing at least one work thread. This array of barriers is another
     466//   argument of the pthread_parallel_create() function.
     467//
     468// Implementation note:
     469// To parallelize the "work" threads creation and termination, the pthread_parallel_create()
     470// function creates a distributed quad-tree (DQT) of "build" threads covering all cores
     471// required to execute the parallel application.
     472// Depending on the hardware topology, this DQT can be truncated, (i.e. some
      473// parent nodes can have less than 4 children), if (x_size != y_size), or if one size
     474// is not a power of 2. Each "build" thread is identified by two indexes [cxy][level].
     475// Each "build" thread makes the following tasks:
     476// 1) It calls the pthread_create() function to create up to 4 children threads, that
      477//    are "work" threads when (level == 0), or "build" threads, when (level > 0).
     478// 2) It initializes the barrier (global variable), used to block/unblock
     479//    the parent thread until children completion.
     480// 3) It calls the pthread_barrier_wait( self ) to wait until all children threads
     481//    completed (successfully or not).
     482// 4) It calls the pthread_barrier_wait( parent ) to unblock the parent thread.
     483//////////////////////////////////////////////////////////////////////////////////////////
     484
     485/*****************************************************************************************
     486 * This blocking function creates N working threads that execute the code defined
     487 * by the <work_func> and <work_args> arguments.
     488 * The number N of created threads is entirely defined by the <root_level> argument.
     489 * This value defines an abstract quad-tree, with a square base : level in [0,1,2,3,4],
     490 * side in [1,2,4,8,16], nclusters in [1,4,16,64,256]. This base is called  macro_cluster.
     491 * A working thread is created on all cores contained in the specified macro-cluster.
     492 * The actual number of physical clusters containing cores can be smaller than the number
     493 * of clusters covered by the quad tree. The actual number of cores in a cluster can be
     494 * less than the max value.
     495 *
     496 * In the current implementation, all threads execute the same <work_func> function,
     497 * on different arguments, that are specified as a 2D array of pointers <work_args>.
     498 * This can be modified in a future version, where the <work_func> argument can become
     499 * a 2D array of pointers, to have one specific function for each thread.
     500 *****************************************************************************************
     501 * @ root_level            : [in]  DQT root level in [0,1,2,3,4].
     502 * @ work_func             : [in]  pointer on start function.
     503 * @ work_args_array       : [in]  pointer on a 2D array of pointers.
     504 * @ parent_barriers_array : [in]  pointer on a 1D array of barriers.
     505 * @ return 0 if success / return -1 if failure.
     506 ****************************************************************************************/
     507int pthread_parallel_create( unsigned int   root_level,
     508                             void         * work_func,
     509                             void         * work_args_array,
     510                             void         * parent_barriers_array );
     511
    416512#endif /* _LIBALMOSMKH_H_ */
    417513
  • trunk/libs/libpthread/pthread.c

    r619 r637  
    230230
    231231////////////////////////////////////////////////////////////////////////////////////////////
    232 // The following functions define another implementation for the POSX barrier
    233 // based on a distributed quadtree implemented in user space, and relying
    234 // on a busy waiting policy.
    235 ////////////////////////////////////////////////////////////////////////////////////////////
    236 
    237 
    238 ////////////////////////////////////////////////////////////////////////////////////////////
    239 // This recursive function initializes the SQT nodes
    240 // traversing the SQT from root to bottom
    241 ////////////////////////////////////////////////////////////////////////////////////////////
    242 static void sqt_barrier_build( pthread_barrier_t  * barrier,
      232// The following functions define another implementation for the POSIX barrier, based on
     233// a distributed quad tree implemented in user space, but using a busy waiting policy.
     234////////////////////////////////////////////////////////////////////////////////////////////
     235
     236
     237////////////////////////////////////////////////////////////////////////////////////////////
      238// This recursive function initializes the DQT nodes traversing the DQT from root to bottom
     239////////////////////////////////////////////////////////////////////////////////////////////
     240static void dqt_barrier_build( pthread_barrier_t  * barrier,
    243241                               unsigned int         x,
    244242                               unsigned int         y,
    245243                               unsigned int         level,
    246                                sqt_node_t         * parent,
     244                               dqt_node_t         * parent,
    247245                               unsigned int         x_size,
    248246                               unsigned int         y_size,
     
    250248{
    251249    // get target node address
    252     sqt_node_t * node = barrier->node[x][y][level];
     250    dqt_node_t * node = barrier->node[x][y][level];
    253251   
    254252    if (level == 0 )        // terminal case
     
    266264
    267265#if PTHREAD_BARRIER_DEBUG
    268 printf("\n[BARRIER] %s : sqt_node[%d][%d][%d] / arity %d / desc %x\n"
     266printf("\n[BARRIER] %s : dqt_node[%d][%d][%d] / arity %d / desc %x\n"
    269267"parent %x / child0 %x / child1 %x / child2 %x / child3 %x\n",
    270268__FUNCTION__, x, y, level, node->arity, node, node->parent,
     
    312310
    313311#if PTHREAD_BARRIER_DEBUG
    314 printf("\n[BARRIER] %s : sqt_node[%d][%d][%d] / arity %d / desc %x\n"
     312printf("\n[BARRIER] %s : dqt_node[%d][%d][%d] / arity %d / desc %x\n"
    315313"parent %x / child0 %x / child1 %x / child2 %x / child3 %x\n",
    316314__FUNCTION__, x, y, level, node->arity, node, node->parent,
     
    322320        {
    323321            if ( (cx[i] < x_size) && (cy[i] < y_size) )
    324             sqt_barrier_build( barrier,
     322            dqt_barrier_build( barrier,
    325323                               cx[i],
    326324                               cy[i],
     
    332330        }
    333331    }
    334 }  // end sqt_barrier_build()
     332}  // end dqt_barrier_build()
    335333
    336334////////////////////////////////////////////////////////////////
     
    394392                     ( (l == 4) && ((x&0x0F) == 0) && ((y&0x0F) == 0) ) )
    395393                 {
    396                      sqt_node_t * node = remote_malloc( sizeof(sqt_node_t) , cxy );
     394                     dqt_node_t * node = remote_malloc( sizeof(dqt_node_t) , cxy );
    397395
    398396                     if( node == NULL )
    399397                     {
    400                          printf("\n[ERROR] in %s : cannot allocate sqt_node in cluster %x\n",
     398                         printf("\n[ERROR] in %s : cannot allocate dqt_node in cluster %x\n",
    401399                         __FUNCTION__ , cxy );
    402400                         return -1;
     
    411409           
    412410    // recursively initialize all SQT nodes from root to bottom
    413     sqt_barrier_build( barrier,
     411    dqt_barrier_build( barrier,
    414412                       0,       
    415413                       0,
     
    428426//////////////////////////////////////////////////////////////////////////////////////////
    429427// This recursive function decrements the distributed "count" variables,
    430 // traversing the SQT from bottom to root.
     428// traversing the DQT from bottom to root.
    431429// The last arrived thread reset the local node before returning.
    432430//////////////////////////////////////////////////////////////////////////////////////////
    433 static void sqt_barrier_decrement( sqt_node_t * node )
     431static void dqt_barrier_decrement( dqt_node_t * node )
    434432{
    435433
     
    457455    {
    458456        // decrement the parent node if the current node is not the root
    459         if ( node->parent != NULL )  sqt_barrier_decrement( node->parent );
     457        if ( node->parent != NULL )  dqt_barrier_decrement( node->parent );
    460458
    461459#if PTHREAD_BARRIER_DEBUG
     
    484482        return;
    485483    }
    486 } // end sqt_barrier_decrement()
     484} // end dqt_barrier_decrement()
    487485   
    488486///////////////////////////////////////////////////////
     
    504502
    505503    // recursively decrement count from bottom to root
    506     sqt_barrier_decrement( barrier->node[x][y][0] );
     504    dqt_barrier_decrement( barrier->node[x][y][0] );
    507505
    508506    hal_user_fence();
  • trunk/libs/libpthread/pthread.h

    r632 r637  
    22 * pthread.h - User level <pthread> library definition.
    33 *
    4  * Author     Alain Greiner (2016,2017,2018)
     4 * Author     Alain Greiner (2016,2017,2018,2019)
    55 *
    66 * Copyright (c) UPMC Sorbonne Universites
  • trunk/libs/mini-libc/stdio.h

    r623 r637  
    22 * stdio.h - User level <stdio> library definition.
    33 *
    4  * Author     Alain Greiner (2016,2017,2018)
     4 * Author     Alain Greiner (2016,2017,2018,2019)
    55 *
    66 * Copyright (c) UPMC Sorbonne Universites
  • trunk/libs/mini-libc/stdlib.c

    r589 r637  
    148148void * malloc( unsigned int size )
    149149{
    150     // get cluster identifier
    151     unsigned int cxy;
    152     unsigned int lid;
    153     get_core( &cxy , &lid );
     150    unsigned int cxy;
     151    unsigned int lid;
     152
     153    // get cluster identifier
     154    get_core_id( &cxy , &lid );
    154155
    155156    return remote_malloc( size, cxy );
     
    160161                unsigned int size )
    161162{
    162     // get calling core cluster identifier
    163     unsigned int cxy;
    164     unsigned int lid;
    165     get_core( &cxy , &lid );
     163    unsigned int cxy;
     164    unsigned int lid;
     165
     166    // get cluster identifier
     167    get_core_id( &cxy , &lid );
    166168
    167169    return remote_calloc( count , size , cxy );
     
    172174                 unsigned int  size )
    173175{
    174     // get calling core cluster identifier
    175     unsigned int cxy;
    176     unsigned int lid;
    177     get_core( &cxy , &lid );
     176    unsigned int cxy;
     177    unsigned int lid;
     178
     179    // get cluster identifier
     180    get_core_id( &cxy , &lid );
    178181
    179182    return remote_realloc( ptr , size , cxy );
     
    183186void free( void * ptr )
    184187{
    185     // get calling core cluster identifier
    186     unsigned int cxy;
    187     unsigned int lid;
    188     get_core( &cxy , &lid );
     188    unsigned int cxy;
     189    unsigned int lid;
     190
     191    // get cluster identifier
     192    get_core_id( &cxy , &lid );
    189193
    190194    remote_free( ptr , cxy );
  • trunk/params-hard.mk

    r636 r637  
    22
    33ARCH      = /Users/alain/soc/tsar-trunk-svn-2013/platforms/tsar_generic_iob
    4 X_SIZE    = 1
     4X_SIZE    = 2
    55Y_SIZE    = 2
    6 NB_PROCS  = 4
     6NB_PROCS  = 2
    77NB_TTYS   = 2
    88IOC_TYPE  = IOC_BDV
  • trunk/user/fft/fft.c

    r636 r637  
    2222// of N complex points, using the Cooley-Tuckey FFT method.
    2323// The N data points are seen as a 2D array (rootN rows * rootN columns).
    24 // Each thread handle (rootN / nthreads) rows. The N input data points
    25 // be initialised in three different modes:
     24// Each thread handle (rootN / nthreads) rows.
     25// The N input data points can be initialised in three different modes:
    2626// - CONSTANT : all data points have the same [1,0] value
    2727// - COSIN    : data point n has [cos(n/N) , sin(n/N)] values
     
    3131//  - M : N = 2**M = number of data points / M must be an even number.
    3232//  - T : nthreads = ncores defined by the hardware / must be power of 2.
     33// The number of threads cannot be larger than the number of rows.
    3334//
    34 // This application uses 4 shared data arrays, that are dynamically
    35 // allocated an distributed, using the remote_malloc() function, with
    36 // one sub-buffer per cluster:
    37 // - data[N] contains N input data points, with 2 double per point.
    38 // - trans[N] contains N intermediate data points, 2 double per point.
    39 // - umain[rootN] contains rootN coefs required for a rootN points FFT.
    40 // - twid[N] contains N coefs : exp(2*pi*i*j/N) / i and j in [0,rootN-1].
    41 // For data, trans, twid, each sub-buffer contains (N/nclusters) points.
    42 // For umain, each sub-buffer contains (rootN/nclusters) points.
     35// This application uses 3 shared data arrays, that are dynamically
     36// allocated and distributed in clusters, with one sub-buffer per cluster:
     37// - data[N] contains N input data points,
     38// - trans[N] contains N intermediate data points,
     39// - twid[N] contains N coefs : exp(2*pi*i*j/N) / i and j in [0,rootN-1]
     40// Each sub-buffer contains (N/nclusters) entries, with 2 double per entry.
     41// These distributed buffers are allocated and initialised in parallel
     42// by the working threads running on core 0 in each cluster.
    4343//
    44 // There is one thread per core.
    45 // The max number of clusters is defined by (X_MAX * Y_MAX).
    46 // The max number of cores per cluster is defined by CORES_MAX.
     44// Each working thread allocates also a private coefs[rootN-1] buffer,
     45// that contains all coefs required for a rootN points FFT.
     46//
     47// There is one working thread per core.
     48// The actual number of cores and cluster in a given hardware architecture
     49// is obtained by the get_config() syscall (x_size, y_size, ncores).
     50// The max number of clusters is bounded by (X_MAX * Y_MAX).
     51// The max number of cores per cluster is bounded by CORES_MAX.
    4752//
    4853// Several configuration parameters can be defined below:
     
    5762//   by the main thread in the main() function.
    5863// - The parallel execution time (parallel_time[i]) is computed by each
    59 //   thread(i) in the slave() function.
     64//   working thread(i) in the work() function.
    6065// - The synchronisation time related to the barriers (sync_time[i])
    61 //   is computed by each thread(i) in the slave() function.
     66//   is computed by each thread(i) in the work() function.
    6267// The results are displayed on the TXT terminal, and registered on disk.
    6368///////////////////////////////////////////////////////////////////////////
     
    8792// parameters
    8893
    89 #define DEFAULT_M               12              // 4096 data points
    90 #define USE_DQT_BARRIER         0               // use DDT barrier if non zero
     94#define DEFAULT_M               14              // 16384 data points
     95#define USE_DQT_BARRIER         1               // use DDT barrier if non zero
    9196#define MODE                    COSIN           // DATA array initialisation mode
    9297#define CHECK                   0               
    93 #define DEBUG_MAIN              0               // trace main() function (detailed if odd)
    94 #define DEBUG_SLAVE             0               // trace slave() function (detailed if odd)
     98#define DEBUG_MAIN              1               // trace main() function (detailed if odd)
     99#define DEBUG_WORK              1               // trace work() function (detailed if odd)
    95100#define DEBUG_FFT1D             0               // trace FFT1D() function (detailed if odd)
    96101#define DEBUG_ROW               0               // trace FFTRow() function (detailed if odd)
     
    101106
    102107/////////////////////////////////////////////////////////////////////////////////////
    103 //             structure containing the arguments for the slave() function
     108//             FFT specific global variables
    104109/////////////////////////////////////////////////////////////////////////////////////
    105110
    106 typedef struct args_s
    107 {
    108     unsigned int   tid;                    // thread continuous index
    109     unsigned int   main_tid;               // main thread continuous index
     111// work function arguments
     112typedef struct work_args_s
     113{
     114    unsigned int        tid;               // thread continuous index
     115    unsigned int        lid;               // core local index
     116    unsigned int        cid;               // cluster continuous index
     117    pthread_barrier_t * parent_barrier;    // parent barrier to signal completion
    110118}
    111 args_t;
    112 
    113 /////////////////////////////////////////////////////////////////////////////////////
    114 //             global variables
    115 /////////////////////////////////////////////////////////////////////////////////////
    116 
    117 unsigned int   x_size;                     // number of clusters per row in the mesh
    118 unsigned int   y_size;                     // number of clusters per column in the mesh
    119 unsigned int   ncores;                     // number of cores per cluster
     119work_args_t;
     120
    120121unsigned int   nthreads;                   // total number of threads (one thread per core)
    121122unsigned int   nclusters;                  // total number of clusters
     
    129130double *       data[CLUSTERS_MAX];         // original time-domain data
    130131double *       trans[CLUSTERS_MAX];        // used as auxiliary space for transpose
     132double *       twid[CLUSTERS_MAX];         // twiddle factor : exp(-2iPI*k*n/N)
    131133double *       bloup[CLUSTERS_MAX];        // used as auxiliary space for DFT
    132 double *       umain[CLUSTERS_MAX];        // roots of unity used fo rootN points FFT   
    133 double *       twid[CLUSTERS_MAX];         // twiddle factor : exp(-2iPI*k*n/N)
    134134
    135135// instrumentation counters
     
    142142pthread_barrierattr_t  barrier_attr;
    143143
    144 // threads identifiers, attributes, and arguments
    145 pthread_t       trdid[THREADS_MAX];        // kernel threads identifiers
    146 pthread_attr_t  attr[THREADS_MAX];         // POSIX thread attributes
    147 args_t          args[THREADS_MAX];         // slave function arguments
    148 
    149 /////////////////////////////////////////////////////////////////////////////////
     144/////////////////////////////////////////////////////////////////////////////////////
     145//             Global variables required by parallel_pthread_create()
     146/////////////////////////////////////////////////////////////////////////////////////
     147
     148// 2D arrays of input arguments for the <work> threads
     149// These arrays are initialised by the application main thread
     150
     151work_args_t       work_args[CLUSTERS_MAX][CORES_MAX];  // work function arguments
     152work_args_t     * work_ptrs[CLUSTERS_MAX][CORES_MAX];  // pointers on arguments
     153
     154// 1D array of barriers to allow the <work> threads to signal termination
     155// this array is initialised in each cluster by the <build[cxy][0]> thread
     156 
     157pthread_barrier_t parent_barriers[CLUSTERS_MAX];        // termination barrier
     158
     159/////////////////////////////////////////////////////////////////////////////////////
    150160//           functions declaration
    151 /////////////////////////////////////////////////////////////////////////////////
    152 
    153 void slave( args_t * args );
     161/////////////////////////////////////////////////////////////////////////////////////
     162
     163void work( work_args_t * args );
    154164
    155165double CheckSum( void );
    156166
    157 void InitX(double ** x , unsigned int mode);
    158 
    159 void InitU(double ** u);
    160 
    161 void InitT(double ** u);
     167void InitD( double    ** data ,
     168            unsigned int mode,
     169            unsigned int tid );
     170
     171void InitT( double    ** twid,
     172            unsigned int tid );
     173
     174void InitU( double * coefs );
    162175
    163176unsigned int BitReverse( unsigned int k );
     
    168181            double     * upriv,
    169182            double    ** twid,
    170             unsigned int MyNum,
     183            unsigned int tid,
    171184            unsigned int MyFirst,
    172185            unsigned int MyLast );
     
    217230    int                 error;
    218231
    219     unsigned int        main_cxy;          // main thread cluster
    220     unsigned int        main_x;            // main thread X coordinate
    221     unsigned int        main_y;            // main thread y coordinate
    222     unsigned int        main_lid;          // main thread local core index
    223     unsigned int        main_tid;          // main thread continuous index
     232    unsigned int        x_size;            // number of clusters per row
     233    unsigned int        y_size;            // number of clusters per column
     234    unsigned int        ncores;            // max number of cores per cluster
    224235
    225236    unsigned int        x;                 // current index for cluster X coordinate
    226237    unsigned int        y;                 // current index for cluster Y coordinate
    227238    unsigned int        lid;               // current index for core in a cluster
    228     unsigned int        ci;                // continuous cluster index (from x,y)
     239    unsigned int        tid;               // continuous thread index
     240    unsigned int        cid;               // cluster continuous index
    229241    unsigned int        cxy;               // hardware specific cluster identifier
    230     unsigned int        tid;               // continuous thread index
     242
     243    char                name[64];          // instrumentation file name
     244    char                path[128];         // instrumentation path name
     245    char                string[256];
     246    int                 ret;
    231247
    232248    unsigned long long  start_init_cycle;
    233249    unsigned long long  end_init_cycle;
    234250
     251#if DEBUG_MAIN
     252    unsigned long long  debug_cycle;
     253#endif
     254
    235255#if CHECK
    236 double     ck1;           // for input/output checking
    237 double     ck3;           // for input/output checking
     256    double              ck1;               // for input/output checking
     257    double              ck3;               // for input/output checking
    238258#endif
    239259   
     
    241261    get_cycle( &start_init_cycle );
    242262
    243     // get platform parameters to compute nthreads & nclusters
     263    // get platform parameters
    244264    if( get_config( &x_size , &y_size , &ncores ) )
    245265    {
     
    269289    }
    270290
     291    // compute nthreads and nclusters
    271292    nthreads  = x_size * y_size * ncores;
    272293    nclusters = x_size * y_size;
     294
      295    // compute covering DQT size and level
     296    unsigned int z = (x_size > y_size) ? x_size : y_size;
     297    unsigned int root_level = (z == 1) ? 0 : (z == 2) ? 1 : (z == 4) ? 2 : (z == 8) ? 3 : 4;
    273298
    274299    // compute various constants depending on N and T
     
    285310    }
    286311
    287     // get main thread coordinates (main_x, main_y, main_lid)
    288     get_core( &main_cxy , &main_lid );
    289     main_x   = HAL_X_FROM_CXY( main_cxy );
    290     main_y   = HAL_Y_FROM_CXY( main_cxy );
    291     main_tid = (((main_x * y_size) + main_y) * ncores) + main_lid;
    292 
    293     printf("\n[fft] starts / core[%x,%d] / %d points / %d thread(s) / PID %x / cycle %d\n",
    294     main_cxy, main_lid, N, nthreads, getpid(), (unsigned int)start_init_cycle );
    295 
    296     // allocate memory for the distributed data[i], trans[i], umain[i], twid[i] buffers
    297     // the index (i) is a continuous cluster index
    298     unsigned int data_size   = (N / nclusters) * 2 * sizeof(double);
    299     unsigned int coefs_size  = (rootN / nclusters) * 2 * sizeof(double);
    300     for (x = 0 ; x < x_size ; x++)
    301     {
    302         for (y = 0 ; y < y_size ; y++)
    303         {
    304             ci         = x * y_size + y;
    305             cxy        = HAL_CXY_FROM_XY( x , y );
    306             data[ci]   = (double *)remote_malloc( data_size  , cxy );
    307             trans[ci]  = (double *)remote_malloc( data_size  , cxy );
    308             bloup[ci]  = (double *)remote_malloc( data_size  , cxy );
    309             umain[ci]  = (double *)remote_malloc( coefs_size , cxy );
    310             twid[ci]   = (double *)remote_malloc( data_size  , cxy );
    311         }
     312    printf("\n[fft] starts / %d points / %d thread(s) / PID %x / cycle %d\n",
     313    N, nthreads, getpid(), (unsigned int)start_init_cycle );
     314
     315    // build instrumentation file name
     316    if( USE_DQT_BARRIER )
     317    snprintf( name , 64 , "p_fft_dqt_%d_%d_%d", N , x_size * y_size , ncores );
     318    else
     319    snprintf( name , 64 , "p_fft_smp_%d_%d_%d", N , x_size * y_size , ncores );
     320
     321    // build pathname
     322    snprintf( path , 128 , "/home/%s", name );
     323
     324    // open instrumentation file
     325    FILE * f = fopen( path , NULL );
     326    if ( f == NULL )
     327    {
     328        printf("\n[fft error] cannot open instrumentation file <%s>\n", path );
     329        exit( 0 );
    312330    }
    313331
    314332#if DEBUG_MAIN
    315 printf("\n[fft] main completes remote_malloc\n");
    316 #endif
    317 
    318     // arrays initialisation
    319     InitX( data , MODE );
    320     InitU( umain );
    321     InitT( twid );
    322 
    323 #if DEBUG_MAIN
    324 printf("\n[fft] main completes arrays init\n");
     333get_cycle( &debug_cycle );
     334printf("\n[fft] main open file <%s> at cycle %d\n",
     335path, (unsigned int)debug_cycle );
    325336#endif
    326337
     
    342353#endif
    343354
    344     // initialise barrier
     355    // initialise barrier synchronizing all <work> threads
    345356    if( USE_DQT_BARRIER )
    346357    {
     
    362373
    363374#if DEBUG_MAIN
    364 printf("\n[fft] main completes barrier init\n");
    365 #endif
    366 
    367     // launch other threads to execute the slave() function
    368     // on cores other than the core running the main thread
     375get_cycle( &debug_cycle );
     376printf("\n[fft] main completes barrier init at cycle %d\n",
     377(unsigned int)debug_cycle );
     378#endif
     379
     380    // build array of arguments for the <work> threads
    369381    for (x = 0 ; x < x_size ; x++)
    370382    {
     
    376388            for ( lid = 0 ; lid < ncores ; lid++ )
    377389            {
    378                 // compute thread user index (continuous index)
    379                 tid = (((x * y_size) + y) * ncores) + lid;
    380 
    381                 // set thread attributes
    382                 attr[tid].attributes = PT_ATTR_CLUSTER_DEFINED | PT_ATTR_CORE_DEFINED;
    383                 attr[tid].cxy        = cxy;
    384                 attr[tid].lid        = lid;
    385 
    386                 // set slave function argument
    387                 args[tid].tid      = tid;
    388                 args[tid].main_tid = main_tid;
    389 
    390                 // create thread
    391                 if( tid != main_tid )
    392                 {
    393                     if ( pthread_create( &trdid[tid],  // pointer on kernel identifier
    394                                          &attr[tid],   // pointer on thread attributes
    395                                          &slave,       // pointer on function
    396                                          &args[tid]) ) // pointer on function arguments
    397                     {
    398                         printf("\n[fft error] creating thread %x\n", tid );
    399                         exit( 0 );
    400                     }
    401 
    402 #if (DEBUG_MAIN & 1)
    403 unsigned long long debug_cycle;
    404 get_cycle( &debug_cycle );
    405 printf("\n[fft] main created thread %d on core[%x,%d] / cycle %d\n",
    406 tid, cxy, lid, (unsigned int)debug_cycle );
    407 #endif
    408                 }
     390                // compute cluster continuous index
     391                cid = (x * y_size) + y;
     392
     393                // compute work thread continuous index
     394                tid = (cid * ncores) + lid;
     395               
     396                // initialize 2D array of arguments
     397                work_args[cxy][lid].tid            = tid;
     398                work_args[cxy][lid].lid            = lid;
     399                work_args[cxy][lid].cid            = cid;
     400                work_args[cxy][lid].parent_barrier = &parent_barriers[cxy];
     401
     402                // initialize 2D array of pointers
     403                work_ptrs[cxy][lid] = &work_args[cxy][lid];
    409404            }
    410405        }
    411406    }
    412407
      408    // register sequential time
     409    get_cycle( &end_init_cycle );
     410    init_time = (unsigned int)(end_init_cycle - start_init_cycle);
     411
    413412#if DEBUG_MAIN
    414 printf("\n[fft] main completes threads creation\n");
    415 #endif
    416 
    417     get_cycle( &end_init_cycle );
    418 
    419     // register sequencial time
    420     init_time = (unsigned int)(end_init_cycle - start_init_cycle);
    421    
    422     // main itself executes the slave() function
    423     slave( &args[main_tid] );
    424 
    425     // wait other threads completion
    426     for (x = 0 ; x < x_size ; x++)
    427     {
    428         for (y = 0 ; y < y_size ; y++)
    429         {
    430             for ( lid = 0 ; lid < ncores ; lid++ )
    431             {
    432                 // compute thread continuous index
    433                 tid = (((x * y_size) + y) * ncores) + lid;
    434 
    435                 if( tid != main_tid )
    436                 {
    437                     if( pthread_join( trdid[tid] , NULL ) )
    438                     {
    439                         printf("\n[fft error] in main thread joining thread %x\n", tid );
    440                         exit( 0 );
    441                     }
    442                    
    443 #if (DEBUG_MAIN & 1)
    444 printf("\n[fft] main thread %d joined thread %d\n", main_tid, tid );
    445 #endif
    446 
    447                 }
    448             }
    449         }
    450     }
     413printf("\n[fft] main completes <work> threads arguments at cycle %d\n",
     414(unsigned int)end_init_cycle );
     415#endif
     416
     417    // create and execute the working threads
     418    if( pthread_parallel_create( root_level,
     419                                 &work,
     420                                 &work_ptrs[0][0],
     421                                 &parent_barriers[0] ) )
     422    {
     423        printf("\n[fft error] creating threads\n");
     424        exit( 0 );
     425    }
     426
     427#if DEBUG_MAIN
     428get_cycle( &debug_cycle );
     429printf("\n[fft] main resume for instrumentation at cycle %d\n",
     430(unsigned int)debug_cycle) ;
     431#endif
    451432
    452433#if PRINT_ARRAY
     
    463444#endif
    464445
    465     // instrumentation
    466     char name[64];
    467     char path[128];
    468     char string[256];
    469     int  ret;
    470 
    471     // build file name
    472     if( USE_DQT_BARRIER )
    473     snprintf( name , 64 , "fft_dqt_%d_%d_%d", N , x_size * y_size , ncores );
    474     else
    475     snprintf( name , 64 , "fft_smp_%d_%d_%d", N , x_size * y_size , ncores );
    476 
    477     // build pathname
    478     snprintf( path , 128 , "/home/%s", name );
    479 
    480     // open instrumentation file
    481     FILE * f = fopen( path , NULL );
    482     if ( f == NULL )
    483     {
    484         printf("\n[fft error] cannot open instrumentation file <%s>\n", path );
    485         exit( 0 );
    486     }
    487     printf("\n[fft] file <%s> open\n", path );
    488 
    489446    // display header on terminal, and save to file
    490447    printf("\n----- %s -----\n", name );
     
    497454    }
    498455
    499     // display results for each thread on terminal, and save to file
     456    // get instrumentation results for each thread
    500457    for (tid = 0 ; tid < nthreads ; tid++)
    501458    {
     
    503460        tid, init_time, parallel_time[tid], sync_time[tid] );
    504461
    505         // display on terminal, and save to instrumentation file
    506         printf("%s" , string );
     462        // save  to instrumentation file
    507463        fprintf( f , "%s" , string );
    508464        if( ret < 0 )
    509465        {
    510466            printf("\n[fft error] cannot write thread %d to file <%s>\n", tid, path );
     467            printf("%s", string );
    511468            exit(0);
    512469        }
    513470    }
    514471
    515     // display MIN/MAX values on terminal and save to file
     472    // compute min/max values
    516473    unsigned int min_para = parallel_time[0];
    517474    unsigned int max_para = parallel_time[0];
     
    527484    }
    528485
     486    // display MIN/MAX values on terminal and save to file
    529487    snprintf( string , 256 , "\n      Sequencial  Parallel       Barrier\n"
    530488                             "MIN : %d\t | %d\t | %d\t   (cycles)\n"
     
    547505        exit(0);
    548506    }
    549     printf("\n[fft] file <%s> closed\n", path );
     507 
     508#if DEBUG_MAIN
     509get_cycle( &debug_cycle );
     510printf("\n[fft] main close file <%s> at cycle %d\n",
     511path, (unsigned int)debug_cycle );
     512#endif
    550513
    551514    exit( 0 );
     
    553516} // end main()
    554517
    555 ///////////////////////////////////////////////////////////////
    556 // This function is executed in parallel by all threads.
    557 ///////////////////////////////////////////////////////////////
    558 void slave( args_t * args )
    559 {
    560     unsigned int   i;
    561     unsigned int   MyNum;           // this thread index
    562     unsigned int   MainNum;         // main thread index
    563     unsigned int   MyFirst;         // index first row allocated to thread
    564     unsigned int   MyLast;          // index last row allocated to thread
    565     double       * upriv;
    566     unsigned int   c_id;
    567     unsigned int   c_offset;
     518/////////////////////////////////////////////////////////////////
     519// This function is executed in parallel by all <work> threads.
     520/////////////////////////////////////////////////////////////////
     521void work( work_args_t * args )
     522{
     523    unsigned int        tid;              // this thread continuous index
     524    unsigned int        lid;              // core local index
     525    unsigned int        cid;              // cluster continuous index
     526    pthread_barrier_t * parent_barrier;   // pointer on parent barrier
     527
     528    unsigned int        MyFirst;          // index first row allocated to thread
     529    unsigned int        MyLast;           // index last row allocated to thread
     530    double            * upriv;            // private array of FFT coefs
    568531
    569532    unsigned long long  parallel_start;
     
    572535    unsigned long long  barrier_stop;
    573536
    574     MyNum   = args->tid;
    575     MainNum = args->main_tid;
     537    // get thread arguments
     538    tid            = args->tid;
     539    lid            = args->lid;             
     540    cid            = args->cid;             
     541    parent_barrier = args->parent_barrier;
    576542
    577543    get_cycle( &parallel_start );
    578544
    579 #if DEBUG_SLAVE
     545#if DEBUG_WORK
    580546printf("\n[fft] %s : thread %d enter / cycle %d\n",
    581 __FUNCTION__, MyNum, (unsigned int)parallel_start );
    582 #endif
     547__FUNCTION__, tid, (unsigned int)parallel_start );
     548#endif
     549
     550    // core 0 allocate memory from the local cluster
     551    // for the distributed data[], trans[], twid[] buffers
     552    // and for the private upriv[] buffer
     553    if( lid == 0 )
     554    {
     555        unsigned int data_size  = (N / nclusters) * 2 * sizeof(double);
     556        unsigned int coefs_size = (rootN - 1) * 2 * sizeof(double); 
     557
     558        data[cid]   = (double *)malloc( data_size );
     559        trans[cid]  = (double *)malloc( data_size );
     560        twid[cid]   = (double *)malloc( data_size );
     561
     562        upriv       = (double *)malloc( coefs_size );
     563    }
    583564
    584565    // BARRIER
     
    586567    pthread_barrier_wait( &barrier );
    587568    get_cycle( &barrier_stop );
    588     sync_time[MyNum] += (unsigned int)(barrier_stop - barrier_start);
    589 
    590 #if DEBUG_SLAVE
    591 printf("\n[@@@] %s : thread %d exit first barrier / cycle %d\n",
    592 __FUNCTION__, MyNum, (unsigned int)barrier_stop );
    593 #endif
    594 
    595     // allocate and initialise local array upriv[]
    596     // that is a local copy of the rootN coefs defined in umain[]
    597     upriv = (double *)malloc(2 * (rootN - 1) * sizeof(double)); 
    598     for ( i = 0 ; i < (rootN - 1) ; i++)
    599     {
    600         c_id     = i / (rootN / nclusters);
    601         c_offset = i % (rootN / nclusters);
    602         upriv[2*i]   = umain[c_id][2*c_offset];
    603         upriv[2*i+1] = umain[c_id][2*c_offset+1];
    604     }
     569    sync_time[tid] += (unsigned int)(barrier_stop - barrier_start);
     570
     571#if DEBUG_WORK
     572printf("\n[fft] %s : thread %d exit first barrier / cycle %d\n",
     573__FUNCTION__, tid, (unsigned int)barrier_stop );
     574#endif
     575
     576    // all threads initialize data[] local array
     577    InitD( data , MODE , tid );
     578
     579    // all threads initialize twid[] local array
     580    InitT( twid , tid );
     581   
     582    // all threads initialise private upriv[] array
     583    InitU( upriv );
     584
     585    // BARRIER
     586    get_cycle( &barrier_start );
     587    pthread_barrier_wait( &barrier );
     588    get_cycle( &barrier_stop );
     589    sync_time[tid] += (unsigned int)(barrier_stop - barrier_start);
     590
     591#if DEBUG_WORK
     592printf("\n[fft] %s : thread %d exit second barrier / cycle %d\n",
     593__FUNCTION__, tid, (unsigned int)barrier_stop );
     594#endif
    605595
    606596    // compute first and last rows handled by the thread
    607     MyFirst = rootN * MyNum / nthreads;
    608     MyLast  = rootN * (MyNum + 1) / nthreads;
     597    MyFirst = rootN * tid / nthreads;
     598    MyLast  = rootN * (tid + 1) / nthreads;
    609599
    610600    // perform forward FFT
    611     FFT1D( 1 , data , trans , upriv , twid , MyNum , MyFirst , MyLast );
     601    FFT1D( 1 , data , trans , upriv , twid , tid , MyFirst , MyLast );
    612602
    613603#if CHECK
     
    615605pthread_barrier_wait( &barrier );
    616606get_cycle( &barrier_stop );
    617 sync_time[MyNum] += (unsigned int)(barrier_stop - barrier_start);
    618 FFT1D( -1 , data , trans , upriv , twid , MyNum , MyFirst , MyLast );
     607sync_time[tid] += (unsigned int)(barrier_stop - barrier_start);
     608FFT1D( -1 , data , trans , upriv , twid , tid , MyFirst , MyLast );
    619609#endif
    620610
     
    622612
    623613    // register parallel time
    624     parallel_time[MyNum] = (unsigned int)(parallel_stop - parallel_start);
    625 
    626 #if DEBUG_SLAVE
    627 printf("\n[fft] %s : thread %x completes fft / p_start %d / p_stop %d\n",
    628 __FUNCTION__, MyNum, (unsigned int)parallel_start, (unsigned int)parallel_stop );
    629 int tid;
    630 for (tid = 0 ; tid < nthreads ; tid++)
    631 {
    632     printf("- tid %d : Sequencial %d / Parallel %d / Barrier %d\n",
    633     tid , init_time, parallel_time[tid], sync_time[tid] );
    634 }
    635 #endif
    636 
    637     // exit only if MyNum != MainNum
    638     if( MyNum != MainNum ) pthread_exit( NULL );
    639 
    640 }  // end slave()
     614    parallel_time[tid] = (unsigned int)(parallel_stop - parallel_start);
     615
     616#if DEBUG_WORK
     617printf("\n[fft] %s : thread %d completes fft / p_start %d / p_stop %d\n",
     618__FUNCTION__, tid, (unsigned int)parallel_start, (unsigned int)parallel_stop );
     619#endif
     620
     621    //  work thread signals completion to main
     622    pthread_barrier_wait( parent_barrier );
     623
     624#if DEBUG_WORK
     625printf("\n[fft] %s : thread %d exit\n",
     626__FUNCTION__, tid );
     627#endif
     628
     629    //  work thread exit
     630    pthread_exit( NULL );
     631
     632}  // end work()
    641633
    642634////////////////////////////////////////////////////////////////////////////////////////
     
    724716}
    725717
    726 
    727 ////////////////////////////
    728 void InitX(double      ** x,
    729            unsigned int   mode )
     718//////////////////////////////////////////////////////////////////////////////////////
     719// Each working thread <tid> contributes to initialize (rootN / nthreads) rows,
     720// in the shared - and distributed - <data> array.
     721//////////////////////////////////////////////////////////////////////////////////////
     722void InitD(double      ** data,
     723           unsigned int   mode,
     724           unsigned int   tid )
    730725{
    731726    unsigned int    i , j;
     
    734729    unsigned int    index;
    735730
    736     for ( j = 0 ; j < rootN ; j++ )      // loop on row index
     731    // compute row_min and row_max
     732    unsigned int    row_min = tid * rows_per_thread;
     733    unsigned int    row_max = row_min + rows_per_thread;
     734
     735    for ( j = row_min ; j < row_max ; j++ )      // loop on rows
    737736    { 
    738         for ( i = 0 ; i < rootN ; i++ )  // loop on point in a row
     737        for ( i = 0 ; i < rootN ; i++ )          // loop on points in a row
    739738        { 
    740739            index     = j * rootN + i;
     
    745744            if ( mode == RANDOM )               
    746745            {
    747                 x[c_id][2*c_offset]   = ( (double)rand() ) / 65536;
    748                 x[c_id][2*c_offset+1] = ( (double)rand() ) / 65536;
     746                data[c_id][2*c_offset]   = ( (double)rand() ) / 65536;
     747                data[c_id][2*c_offset+1] = ( (double)rand() ) / 65536;
    749748            }
    750749           
     
    754753            {
    755754                double phi = (double)( 2 * PI * index) / N;
    756                 x[c_id][2*c_offset]   = cos( phi );
    757                 x[c_id][2*c_offset+1] = sin( phi );
     755                data[c_id][2*c_offset]   = cos( phi );
     756                data[c_id][2*c_offset+1] = sin( phi );
    758757            }
    759758
     
    761760            if ( mode == CONSTANT )               
    762761            {
    763                 x[c_id][2*c_offset]   = 1.0;
    764                 x[c_id][2*c_offset+1] = 0.0;
     762                data[c_id][2*c_offset]   = 1.0;
     763                data[c_id][2*c_offset+1] = 0.0;
    765764            }
    766765        }
     
    768767}
    769768
    770 /////////////////////////
    771 void InitU( double ** u )
    772 {
    773     unsigned int    q;
    774     unsigned int    j;
    775     unsigned int    base;
    776     unsigned int    n1;
    777     unsigned int    c_id;
    778     unsigned int    c_offset;
    779     double  phi;
    780     unsigned int    stop = 0;
    781 
    782     for (q = 0 ; ((unsigned int)(1 << q) < N) && (stop == 0) ; q++)
    783     { 
    784         n1 = 1 << q;
    785         base = n1 - 1;
    786         for (j = 0; (j < n1) && (stop == 0) ; j++)
    787         {
    788             if (base + j > rootN - 1) return;
    789 
    790             c_id      = (base + j) / (rootN / nclusters);
    791             c_offset  = (base + j) % (rootN / nclusters);
    792             phi = (double)(2.0 * PI * j) / (2 * n1);
    793             u[c_id][2*c_offset]   = cos( phi );
    794             u[c_id][2*c_offset+1] = -sin( phi );
    795         }
    796     }
    797 }
    798 
    799 //////////////////////////
    800 void InitT( double ** u )
     769///////////////////////////////////////////////////////////////////////////////////////
     770// Each working thread <tid> contributes to initialize (rootN / nthreads) rows,
     771// in the shared - and distributed - <twiddle> array.
     772///////////////////////////////////////////////////////////////////////////////////////
     773void InitT( double      ** twid,
     774            unsigned int   tid )
    801775{
    802776    unsigned int    i, j;
     
    806780    double  phi;
    807781
    808     for ( j = 0 ; j < rootN ; j++ )      // loop on row index
     782    // compute row_min and row_max
     783    unsigned int    row_min = tid * rows_per_thread;
     784    unsigned int    row_max = row_min + rows_per_thread;
     785
     786    for ( j = row_min ; j < row_max ; j++ )      // loop on rows
    809787    { 
    810         for ( i = 0 ; i < rootN ; i++ )  // loop on points in a row
     788        for ( i = 0 ; i < rootN ; i++ )          // loop on points in a row
    811789        { 
    812790            index     = j * rootN + i;
     
    815793
    816794            phi = (double)(2.0 * PI * i * j) / N;
    817             u[c_id][2*c_offset]   = cos( phi );
    818             u[c_id][2*c_offset+1] = -sin( phi );
     795            twid[c_id][2*c_offset]   = cos( phi );
     796            twid[c_id][2*c_offset+1] = -sin( phi );
     797        }
     798    }
     799}
     800
     801///////////////////////////////////////////////////////////////////////////////////////
      802// Each working thread initializes the private <upriv> array / (rootN - 1) entries.
     803///////////////////////////////////////////////////////////////////////////////////////
     804void InitU( double * upriv )
     805{
     806    unsigned int    q;
     807    unsigned int    j;
     808    unsigned int    base;
     809    unsigned int    n1;
     810    double  phi;
     811
     812    for (q = 0 ; ((unsigned int)(1 << q) < N) ; q++)
     813    { 
     814        n1 = 1 << q;    // n1 == 2**q
     815        base = n1 - 1;
     816        for (j = 0; (j < n1) ; j++)
     817        {
     818            if (base + j > rootN - 1) return;
     819
     820            phi = (double)(2.0 * PI * j) / (2 * n1);
     821            upriv[2*(base+j)]   = cos( phi );
     822            upriv[2*(base+j)+1] = -sin( phi );
    819823        }
    820824    }
     
    856860            double        *  upriv,           // local array containing coefs for rootN FFT
    857861            double       **  twid,            // distributed arrays containing N twiddle factors
    858             unsigned int     MyNum,           // thread continuous index
     862            unsigned int     tid,             // thread continuous index
    859863            unsigned int     MyFirst,
    860864            unsigned int     MyLast )
     
    868872get_cycle( &cycle );
    869873printf("\n[fft] %s : thread %d enter / first %d / last %d / cycle %d\n",
    870 __FUNCTION__, MyNum, MyFirst, MyLast, (unsigned int)cycle );
     874__FUNCTION__, tid, MyFirst, MyLast, (unsigned int)cycle );
    871875#endif
    872876
     
    877881get_cycle( &cycle );
    878882printf("\n[fft] %s : thread %d after first transpose / cycle %d\n",
    879 __FUNCTION__, MyNum, (unsigned int)cycle );
     883__FUNCTION__, tid, (unsigned int)cycle );
    880884if( PRINT_ARRAY ) PrintArray( tmp , N );
    881885#endif
     
    885889    pthread_barrier_wait( &barrier );
    886890    get_cycle( &barrier_stop );
    887     sync_time[MyNum] = (unsigned int)(barrier_stop - barrier_start);
     891    sync_time[tid] = (unsigned int)(barrier_stop - barrier_start);
    888892
    889893#if( DEBUG_FFT1D & 1 )
    890894get_cycle( &cycle );
    891895printf("\n[fft] %s : thread %d exit barrier after first transpose / cycle %d\n",
    892 __FUNCTION__, MyNum, (unsigned int)cycle );
     896__FUNCTION__, tid, (unsigned int)cycle );
    893897#endif
    894898
     
    902906
    903907#if( DEBUG_FFT1D & 1 )
    904 printf("\n[fft] %s : thread %d after first twiddle\n", __FUNCTION__, MyNum);
     908printf("\n[fft] %s : thread %d after first twiddle\n", __FUNCTION__, tid);
    905909if( PRINT_ARRAY ) PrintArray( tmp , N );
    906910#endif
     
    912916
    913917#if( DEBUG_FFT1D & 1 )
    914 printf("\n[fft] %s : thread %d exit barrier after first twiddle\n", __FUNCTION__, MyNum);
    915 #endif
    916 
    917     sync_time[MyNum] += (unsigned int)(barrier_stop - barrier_start);
     918printf("\n[fft] %s : thread %d exit barrier after first twiddle\n", __FUNCTION__, tid);
     919#endif
     920
     921    sync_time[tid] += (unsigned int)(barrier_stop - barrier_start);
    918922
    919923    // transpose tmp to x
     
    921925
    922926#if( DEBUG_FFT1D & 1 )
    923 printf("\n[fft] %s : thread %d after second transpose\n", __FUNCTION__, MyNum);
     927printf("\n[fft] %s : thread %d after second transpose\n", __FUNCTION__, tid);
    924928if( PRINT_ARRAY ) PrintArray( x , N );
    925929#endif
     
    931935
    932936#if( DEBUG_FFT1D & 1 )
    933 printf("\n[fft] %s : thread %d exit barrier after second transpose\n", __FUNCTION__, MyNum);
    934 #endif
    935 
    936     sync_time[MyNum] += (unsigned int)(barrier_stop - barrier_start);
     937printf("\n[fft] %s : thread %d exit barrier after second transpose\n", __FUNCTION__, tid);
     938#endif
     939
     940    sync_time[tid] += (unsigned int)(barrier_stop - barrier_start);
    937941
    938942    // do FFTs on rows of x and apply the scaling factor
     
    944948
    945949#if( DEBUG_FFT1D & 1 )
    946 printf("\n[fft] %s : thread %d after FFT on rows\n", __FUNCTION__, MyNum);
     950printf("\n[fft] %s : thread %d after FFT on rows\n", __FUNCTION__, tid);
    947951if( PRINT_ARRAY ) PrintArray( x , N );
    948952#endif
     
    954958
    955959#if( DEBUG_FFT1D & 1 )
    956 printf("\n[fft] %s : thread %d exit barrier after FFT on rows\n", __FUNCTION__, MyNum);
    957 #endif
    958     sync_time[MyNum] += (unsigned int)(barrier_stop - barrier_start);
     960printf("\n[fft] %s : thread %d exit barrier after FFT on rows\n", __FUNCTION__, tid);
     961#endif
     962    sync_time[tid] += (unsigned int)(barrier_stop - barrier_start);
    959963
    960964    // transpose x to tmp
     
    962966
    963967#if( DEBUG_FFT1D & 1 )
    964 printf("\n[fft] %s : thread %x after third transpose\n", __FUNCTION__, MyNum);
     968printf("\n[fft] %s : thread %x after third transpose\n", __FUNCTION__, tid);
    965969if( PRINT_ARRAY ) PrintArray( x , N );
    966970#endif
     
    972976
    973977#if( DEBUG_FFT1D & 1 )
    974 printf("\n[fft] %s : thread %d exit barrier after third transpose\n", __FUNCTION__, MyNum);
    975 #endif
    976 
    977     sync_time[MyNum] += (unsigned int)(barrier_stop - barrier_start);
    978     sync_time[MyNum] += (long)(barrier_stop - barrier_start);
     978printf("\n[fft] %s : thread %d exit barrier after third transpose\n", __FUNCTION__, tid);
     979#endif
     980
     981    sync_time[tid] += (unsigned int)(barrier_stop - barrier_start);
     982    sync_time[tid] += (long)(barrier_stop - barrier_start);
    979983
    980984    // copy tmp to x
     
    982986
    983987#if DEBUG_FFT1D
    984 printf("\n[fft] %s : thread %d completed\n", __FUNCTION__, MyNum);
     988printf("\n[fft] %s : thread %d completed\n", __FUNCTION__, tid);
    985989if( PRINT_ARRAY ) PrintArray( x , N );
    986990#endif
  • trunk/user/idbg/idbg.c

    r580 r637  
    2020
    2121    get_cycle( &cycle );
    22     get_core( &cxy , &lid );
     22    get_core_id( &cxy , &lid );
    2323
    2424    printf( "\n[IDBG] starts on core[%x,%d] / cycle %d\n",
  • trunk/user/ksh/ksh.c

    r636 r637  
    11861186        char           cmd[CMD_MAX_SIZE];               // buffer for one command
    11871187
    1188 // 1. first direct command
     1188/* 1. first direct command
    11891189if( sem_wait( &semaphore ) )
    11901190{
     
    11991199strcpy( cmd , "load bin/user/sort.elf" );
    12001200execute( cmd );
    1201 //
    1202 
    1203 
    1204 
    1205 // 2. second direct command
     1201*/
     1202
     1203
     1204
     1205/* 2. second direct command
    12061206if( sem_wait( &semaphore ) )
    12071207{
     
    12161216strcpy( cmd , "load bin/user/fft.elf" );
    12171217execute( cmd );
    1218 //
     1218*/
    12191219
    12201220
     
    14551455    // get KSH process pid and core
    14561456    parent_pid = getpid();
    1457     get_core( &cxy , &lid );
     1457    get_core_id( &cxy , &lid );
    14581458
    14591459#if DEBUG_MAIN
  • trunk/user/pgcd/pgcd.c

    r626 r637  
    2727
    2828    get_cycle( &cycle );
    29     get_core( &cxy , &lid );
     29    get_core_id( &cxy , &lid );
    3030
    3131    printf( "\n[pgcd] starts on core[%x,%d] / cycle %d\n\n",
  • trunk/user/sort/sort.c

    r636 r637  
    5454#include <hal_macros.h>
    5555
    56 #define ARRAY_LENGTH        2048       // number of items
    57 #define MAX_THREADS         1024       // 16 * 16 * 4
    58 
    59 #define USE_DQT_BARRIER     1          // use DQT barrier if non zero
    60 #define DISPLAY_ARRAY       0          // display items values before and after
    61 #define DEBUG_MAIN          0          // trace main function
    62 #define DEBUG_SORT          0          // trace sort function
    63 #define CHECK_RESULT        0          // for debug
    64 #define INSTRUMENTATION     1          // register computation times on file
    65 
    66 /////////////////////////////////////////////////////////////
    67 // argument for the sort() function (one thread per core)
    68 /////////////////////////////////////////////////////////////
     56#define ARRAY_LENGTH        2048            // number of items
     57#define MAX_THREADS         1024            // 16 * 16 * 4
     58
     59#define X_MAX               16              // max number of clusters in a row
     60#define Y_MAX               16              // max number of clusters in a column
     61#define CORES_MAX           4               // max number of cores in a cluster
     62#define CLUSTERS_MAX        X_MAX * Y_MAX
     63
     64#define USE_DQT_BARRIER     1               // use DQT barrier if non zero
     65#define DISPLAY_ARRAY       0               // display items values before and after
     66#define DEBUG_MAIN          0               // trace main function
     67#define DEBUG_SORT          0               // trace sort function
     68#define CHECK_RESULT        0               // for debug
     69#define INSTRUMENTATION     1               // register computation times on file
     70
     71///////////////////////////////////////////////////////////////////////////////////
     72//            Arguments for the sort() function
     73///////////////////////////////////////////////////////////////////////////////////
    6974
    7075typedef struct
    7176{
    72     unsigned int threads;       // total number of threads
    73     unsigned int thread_uid;    // thread user index (0 to threads -1)
    74     unsigned int main_uid;      // main thread user index
     77    unsigned int        tid;                // continuous thread index
     78    unsigned int        threads;            // total number of threads
     79    pthread_barrier_t * parent_barrier;     // pointer on termination barrier
    7580}
    76 args_t;
    77 
    78 //////////////////////////////////////////
    79 //      Global variables
    80 //////////////////////////////////////////
     81sort_args_t;
     82
     83////////////////////////////////////////////////////////////////////////////////////
     84//            Sort specific global variables
     85////////////////////////////////////////////////////////////////////////////////////
    8186
    8287int                 array0[ARRAY_LENGTH];    // values to sort
     
    8590pthread_barrier_t   barrier;                 // synchronisation variables
    8691
    87 pthread_t           trdid[MAX_THREADS];      // kernel identifiers
    88 pthread_attr_t      attr[MAX_THREADS];       // thread attributes
    89 args_t              arg[MAX_THREADS];        // sort function arguments
     92/////////////////////////////////////////////////////////////////////////////////////
     93//             Global variables required by parallel_pthread_create()
     94/////////////////////////////////////////////////////////////////////////////////////
     95
     96// 2D arrays of input arguments for the <sort> threads
     97// These arrays are initialised by the application main thread
     98
     99sort_args_t       sort_args[CLUSTERS_MAX][CORES_MAX];  // sort function arguments
     100sort_args_t     * sort_ptrs[CLUSTERS_MAX][CORES_MAX];  // pointers on arguments
     101
     102// 1D array of barriers to allow the <sort> threads to signal termination
     103// this array is initialised by the pthread_parallel_create() function
     104 
     105pthread_barrier_t parent_barriers[CLUSTERS_MAX];       // termination barrier
     106
    90107
    91108////////////////////////////////////
     
    157174}  // end merge()
    158175
    159 //////////////////////////////////////
    160 static void sort( const args_t * ptr )
     176//////////////////////////////
     177void sort( sort_args_t * ptr )
    161178{
    162     unsigned int       i;
    163     unsigned long long cycle;
    164     unsigned int       cxy;
    165     unsigned int       lid;
    166 
    167     int              * src_array  = NULL;
    168     int              * dst_array  = NULL;
    169 
    170     // get core coordinates an date
    171     get_core( &cxy , &lid );
    172     get_cycle( &cycle );
    173 
    174     unsigned int  thread_uid = ptr->thread_uid;
    175     unsigned int  threads    = ptr->threads;
    176     unsigned int  main_uid   = ptr->main_uid;
    177 
    178 #if DISPLAY_ARRAY
    179 unsigned int n;
    180 if( thread_uid == main_uid )
    181 {
    182     printf("\n*** array before sort\n");
    183     for( n=0; n<ARRAY_LENGTH; n++) printf("array[%d] = %d\n", n , array0[n] );
    184 }
     179    unsigned int        i;
     180    int               * src_array  = NULL;
     181    int               * dst_array  = NULL;
     182
     183    // get arguments
     184    unsigned int        tid            = ptr->tid;
     185    unsigned int        threads        = ptr->threads;
     186    pthread_barrier_t * parent_barrier = ptr->parent_barrier;
     187
     188    unsigned int        items      = ARRAY_LENGTH / threads;
     189    unsigned int        stages     = __builtin_ctz( threads ) + 1;
     190
     191#if DEBUG_SORT
     192printf("\n[sort] start : ptr %x / tid %d / threads %d / barrier %x\n",
     193ptr, tid, threads, parent_barrier );
     194#endif
     195
     196    bubbleSort( array0, items, items * tid );
     197
     198#if DEBUG_SORT
     199printf("\n[sort] thread[%d] : stage 0 completed\n", tid );
    185200#endif
    186201
     
    189204
    190205#if DEBUG_SORT
    191 if( thread_uid == 0 )
    192 printf("\n[sort] thread[%d] exit barrier 0\n", thread_uid );
    193 #endif
    194 
    195     unsigned int  items      = ARRAY_LENGTH / threads;
    196     unsigned int  stages     = __builtin_ctz( threads ) + 1;
    197 
    198 #if DEBUG_SORT
    199 if( thread_uid == 0 )
    200 printf("\n[sort] thread[%d] : start\n", thread_uid );
    201 #endif
    202 
    203     bubbleSort( array0, items, items * thread_uid );
    204 
    205 #if DEBUG_SORT
    206 if( thread_uid == 0 )
    207 printf("\n[sort] thread[%d] : stage 0 completed\n", thread_uid );
    208 #endif
    209 
    210     /////////////////////////////////
    211     pthread_barrier_wait( &barrier );
    212 
    213 #if DEBUG_SORT
    214 if( thread_uid == 0 )
    215 printf("\n[sort] thread[%d] exit barrier 0\n", thread_uid );
    216 #endif
    217 
    218 #if DISPLAY_ARRAY
    219 if( thread_uid == main_uid )
    220 {
    221     printf("\n*** array after bubble sort\n");
    222     for( n=0; n<ARRAY_LENGTH; n++) printf("array[%d] = %d\n", n , array0[n] );
    223 }
     206printf("\n[sort] thread[%d] exit barrier 0\n", tid );
    224207#endif
    225208
     
    239222        }
    240223
    241         if( (thread_uid & ((1<<i)-1)) == 0 )
    242         {
    243 
    244 #if DEBUG_SORT
    245 if( thread_uid == 0 )
    246 printf("\n[sort] thread[%d] : stage %d start\n", thread_uid , i );
     224        if( (tid & ((1<<i)-1)) == 0 )
     225        {
     226
     227#if DEBUG_SORT
     228printf("\n[sort] thread[%d] : stage %d start\n", tid , i );
    247229#endif
    248230            merge( src_array,
    249231                   dst_array,
    250232                   items << (i-1),
    251                    items * thread_uid,
    252                    items * (thread_uid + (1 << (i-1))),
    253                    items * thread_uid );
    254 
    255 #if DEBUG_SORT
    256 if( thread_uid == 0 )
    257 printf("\n[sort] thread[%d] : stage %d completed\n", thread_uid , i );
     233                   items * tid,
     234                   items * (tid + (1 << (i-1))),
     235                   items * tid );
     236
     237#if DEBUG_SORT
     238printf("\n[sort] thread[%d] : stage %d completed\n", tid , i );
    258239#endif
    259240        }
     
    263244
    264245#if DEBUG_SORT
    265 if( thread_uid == 0 )
    266 printf("\n[sort] thread[%d] exit barrier %d\n", thread_uid , i );
    267 #endif
    268 
    269 #if DISPLAY_ARRAY
    270 if( thread_uid == main_uid )
    271 {
    272     printf("\n*** array after merge %d\n", i );
    273     for( n=0; n<ARRAY_LENGTH; n++) printf("array[%d] = %d\n", n , dst_array[n] );
    274 }
     246printf("\n[sort] thread[%d] exit barrier %d\n", tid , i );
    275247#endif
    276248
    277249    }  // en for stages
    278250
    279     // all threads but the main thread exit
    280     if( thread_uid != main_uid ) pthread_exit( NULL );
     251    // sort thread signal completion to main thread
     252    pthread_barrier_wait( parent_barrier );
     253
     254#if DEBUG_SORT
     255printf("\n[sort] thread[%d] exit\n", tid );
     256#endif
     257
     258    // sort thread exit
     259    pthread_exit( NULL );
    281260
    282261} // end sort()
     
    291270    unsigned int           ncores;             // number of cores per cluster
    292271    unsigned int           total_threads;      // total number of threads
    293     unsigned int           thread_uid;         // user defined thread index
    294     unsigned int           main_cxy;           // cluster identifier for main
    295     unsigned int           main_x;             // X coordinate for main thread
    296     unsigned int           main_y;             // Y coordinate for main thread
    297     unsigned int           main_lid;           // core local index for main thread
    298     unsigned int           main_uid;           // thread user index for main thread
    299     unsigned int           x;                  // X coordinate for a thread
    300     unsigned int           y;                  // Y coordinate for a thread
     272    unsigned int           x;                  // X coordinate for a sort thread
     273    unsigned int           y;                  // Y coordinate for a sort thread
     274    unsigned int           cxy;                // cluster identifier for a sort thead
    301275    unsigned int           lid;                // core local index for a thread
     276    unsigned int           tid;                // sort thread continuous index
     277    pthread_barrierattr_t  barrier_attr;       // barrier attributes (used for DQT)
    302278    unsigned int           n;                  // index in array to sort
    303     pthread_barrierattr_t  barrier_attr;       // barrier attributes
    304279
    305280    unsigned long long     start_cycle;
     
    314289    total_threads = x_size * y_size * ncores;
    315290
    316     // get core coordinates and user index for the main thread
    317     get_core( &main_cxy , & main_lid );
    318     main_x   = HAL_X_FROM_CXY( main_cxy );
    319     main_y   = HAL_Y_FROM_CXY( main_cxy );
    320     main_uid = (((main_x * y_size) + main_y) * ncores) + main_lid;
     291    // compute covering DQT size an level
     292    unsigned int z = (x_size > y_size) ? x_size : y_size;
     293    unsigned int root_level = (z == 1) ? 0 : (z == 2) ? 1 : (z == 4) ? 2 : (z == 8) ? 3 : 4;
    321294
    322295    // checks number of threads
     
    326299         (total_threads != 512) && (total_threads != 1024) )
    327300    {
    328         printf("\n[sort error] number of cores must be power of 2\n");
     301        printf("\n[sort] ERROR : number of cores must be power of 2\n");
    329302        exit( 0 );
    330303    }
     
    333306    if ( ARRAY_LENGTH % total_threads)
    334307    {
    335         printf("\n[sort error] array size must be multiple of number of threads\n");
     308        printf("\n[sort] ERROR : array size must be multiple of number of threads\n");
    336309        exit( 0 );
    337310    }
     
    355328    if( error )
    356329    {
    357         printf("\n[sort error] cannot initialise barrier\n" );
     330        printf("\n[sort] ERROR : cannot initialise barrier\n" );
    358331        exit( 0 );
    359332    }
     
    370343    }
    371344
     345#if DISPLAY_ARRAY
     346    printf("\n*** array before sort\n");
     347    for( n=0; n<ARRAY_LENGTH; n++) printf("array[%d] = %d\n", n , array0[n] );
     348#endif
     349
    372350#if DEBUG_MAIN
    373351printf("\n[sort] main completes array init\n");
    374352#endif
    375353
    376     // launch other threads to execute sort() function
    377     // on cores other than the core running the main thread
    378     for ( x = 0 ; x < x_size ; x++ )
    379     {
    380         for ( y = 0 ; y < y_size ; y++ )
    381         {
     354    // build array of arguments for the <sort> threads
     355    for (x = 0 ; x < x_size ; x++)
     356    {
     357        for (y = 0 ; y < y_size ; y++)
     358        {
     359            // compute cluster identifier
     360            cxy = HAL_CXY_FROM_XY( x , y );
     361
    382362            for ( lid = 0 ; lid < ncores ; lid++ )
    383363            {
    384                 // compute thread user index (continuous index)
    385                 thread_uid = (((x * y_size) + y) * ncores) + lid;
    386 
    387                 // set arguments for all threads
    388                 arg[thread_uid].threads      = total_threads;
    389                 arg[thread_uid].thread_uid   = thread_uid;
    390                 arg[thread_uid].main_uid     = main_uid;
    391 
    392                 // set thread attributes for all threads
    393                 attr[thread_uid].attributes = PT_ATTR_CLUSTER_DEFINED | PT_ATTR_CORE_DEFINED;
    394                 attr[thread_uid].cxy        = HAL_CXY_FROM_XY( x , y );
    395                 attr[thread_uid].lid        = lid;
    396 
    397                 if( thread_uid != main_uid )
    398                 {
    399                     if ( pthread_create( &trdid[thread_uid],  // buffer for kernel identifier
    400                                          &attr[thread_uid],   // thread attributes
    401                                          &sort,               // entry function
    402                                          &arg[thread_uid] ) ) // sort arguments
    403                     {
    404                         printf("\n[sort error] main cannot create thread %x \n", thread_uid );
    405                         exit( 0 );
    406                     }
    407 
    408 #if (DEBUG_MAIN & 1)
    409 printf("\n[sort] main created thread %x \n", thread_uid );
    410 #endif
    411                 }
     364                // compute thread continuous index
     365                tid = (((x * y_size) + y) * ncores) + lid;
     366
     367                // initialize 2D array of arguments
     368                sort_args[cxy][lid].tid            = tid;
     369                sort_args[cxy][lid].threads        = total_threads;
     370                sort_args[cxy][lid].parent_barrier = &parent_barriers[cxy];
     371
     372                // initialize 2D array of pointers
     373                sort_ptrs[cxy][lid] = &sort_args[cxy][lid];
    412374            }
    413375        }
    414376    }
    415    
     377
    416378    ///////////////////////////
    417379    get_cycle( &seq_end_cycle );
     
    422384#endif
    423385
    424     // the main thread run also the sort() function
    425     sort( &arg[main_uid] );
    426 
    427     // wait other threads completion
    428     for ( x = 0 ; x < x_size ; x++ )
    429     {
    430         for ( y = 0 ; y < y_size ; y++ )
    431         {
    432             for ( lid = 0 ; lid < ncores ; lid++ )
    433             {
    434                 // compute thread continuous index
    435                 thread_uid = (((x * y_size) + y) * ncores) + lid;
    436 
    437                 if( thread_uid != main_uid )
    438                 {
    439                     if( pthread_join( trdid[thread_uid] , NULL ) )
    440                     {
    441                         printf("\n[fft error] in main thread %d joining thread %d\n",
    442                         main_uid , thread_uid );
    443                         exit( 0 );
    444                     }
    445                    
    446 #if (DEBUG_MAIN & 1)
    447 printf("\n[fft] main thread %d joined thread %d\n", main_uid, thread_uid );
    448 #endif
    449 
    450                 }
    451             }
    452         }
     386    // create and execute the working threads
     387    if( pthread_parallel_create( root_level,
     388                                 &sort,
     389                                 &sort_ptrs[0][0],
     390                                 &parent_barriers[0] ) )
     391    {
     392        printf("\n[sort] ERROR : cannot create threads\n");
     393        exit( 0 );
    453394    }
    454395
     
    456397    get_cycle( &para_end_cycle );
    457398
    458     printf("\n[sort] main completes parallel sort at cycle %d\n",
    459     (unsigned int)para_end_cycle );
     399#if DEBUG_main
     400printf("\n[sort] main completes parallel sort at cycle %d\n",
     401(unsigned int)para_end_cycle );
     402#endif
    460403
    461404    // destroy barrier
    462405    pthread_barrier_destroy( &barrier );
     406
     407#if DISPLAY_ARRAY
     408    printf("\n*** array after merge %d\n", i );
     409    for( n=0; n<ARRAY_LENGTH; n++) printf("array[%d] = %d\n", n , dst_array[n] );
     410#endif
    463411
    464412#if CHECK_RESULT
     
    492440    // build file name
    493441    if( USE_DQT_BARRIER )
    494     snprintf( name , 64 , "sort_dqt_%d_%d_%d", ARRAY_LENGTH, x_size * y_size, ncores );
     442    snprintf( name , 64 , "p_sort_dqt_%d_%d_%d", ARRAY_LENGTH, x_size * y_size, ncores );
    495443    else
    496     snprintf( name , 64 , "sort_smp_%d_%d_%d", ARRAY_LENGTH, x_size * y_size, ncores );
     444    snprintf( name , 64 , "p_sort_smp_%d_%d_%d", ARRAY_LENGTH, x_size * y_size, ncores );
    497445
    498446    // build file pathname
     
    515463    if( stream == NULL )
    516464    {
    517         printf("\n[sort error] cannot open instrumentation file <%s>\n", path );
     465        printf("\n[sort] ERROR : cannot open instrumentation file <%s>\n", path );
    518466        exit(0);
    519467    }
     
    532480    if( ret < 0 )
    533481    {
    534         printf("\n[sort error] cannot write to instrumentation file <%s>\n", path );
     482        printf("\n[sort] ERROR : cannot write to instrumentation file <%s>\n", path );
    535483        exit(0);
    536484    }
     
    548496    if( ret )
    549497    {
    550         printf("\n[sort error] cannot close instrumentation file <%s>\n", path );
     498        printf("\n[sort] ERROR : cannot close instrumentation file <%s>\n", path );
    551499        exit(0);
    552500    }
Note: See TracChangeset for help on using the changeset viewer.