/* 
 * st_new.h
 */

/* 
 * Copyright (c) 1999 by Kenjiro Taura, Akinori Yonezawa. All rights reserved.
 * Copyright (c) 1999 by Yoshihiro Oyama, Toshio Endo. All rights reserved.
 * Copyright (c) 1999 by Kunio Tabata. All rights reserved.
 * Copyright (c) 1999 by Mitsubishi Research Institute.  All rights reserved.
 * Copyright (c) 1999 by Information-technology Promotion Agency.  All rights reserved.
 *
 * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY EXPRESSED
 * OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
 *
 * Permission is hereby granted to use or copy this program
 * for any purpose,  provided the above notices are retained on all copies.
 * Permission to modify the code and to distribute modified code is granted,
 * provided the above notices are retained, and a notice that the code was
 * modified is included with the above copyright notice.
 */

#ifndef __ST_H__
#define __ST_H__

#ifdef __cplusplus
extern "C" {
#endif
  
/* NOTE(review): nothing in the visible part of this header tests TMP;
   it looks like a temporary switch -- confirm it is still needed. */
#define TMP 1
  
/* Build-time switches.  Each one defaults to 1 unless the includer (or the
   compiler command line) has already defined it. */

/* define ST_DBG to 0 to squeeze speed by not keeping information useful
   for debugging */
#ifndef ST_DBG
#define ST_DBG 1
#endif
  
/* define ST_ASSERT_ON to 0 to skip some runtime checks */
#ifndef ST_ASSERT_ON
#define ST_ASSERT_ON 1
#endif
  
/* define STACK_INV_CHECK to 0 to skip stack consistency checks */
#ifndef STACK_INV_CHECK
#define STACK_INV_CHECK 1
#endif
  
/* define ST_STAT to 0 to remove overhead for statistics  */
#ifndef ST_STAT
#define ST_STAT 1
#endif
  
/* define ST_PROF to 0 to turn off profiler */
#ifndef ST_PROF
#define ST_PROF 1
#endif
  
#include "st_thrcrt.h"
  
/* many places are affected when changing the following parameters.
   WORKER_PROC_N_PARAMS is derived from THR_PROC_N_PARAMS: it is always
   three less (4 with the value below). */
#define THR_PROC_N_PARAMS 7
#define WORKER_PROC_N_PARAMS (THR_PROC_N_PARAMS - 3)
  
/* when you change this, change asmpp.awk (definition of C_removed_ra in 
   asmpp_main) accordingly.
   NOTE(review): the name suggests this is the value stored in place of a
   removed return address -- confirm against asmpp.awk. */
#define REMOVED_RA ((void*)0)
/* alternative value, kept for reference: */
/* #define REMOVED_RA ((void*)24) */
  
/* this file should be included from C/C++ files that use stackthreads.
   this is also included from a table file (.o.tab.c) generated by asmpp.
   in that case, only definitions of st_fork_point_info and st_proc_info
   are needed. including others are waste of time and, worse, result in
   compile error, because .o.tab.c declare all functions to be void. this
   is no problem, because it never calls them, but may still conflict with 
   the original prototype declarations. to avoid this, when .o.tab.c includes
   this file, it defines PROC_INFO_ONLY. */
  
#ifndef PROC_INFO_ONLY

  /*#include <stdio.h>*/
/* the stdio stream that st_errout names (presumably where the runtime prints
   its error output -- confirm against the users of st_errout).
   note that <stdio.h> is not included by this header (the include just
   above is commented out), so a file that uses st_errout must make stdout
   visible itself. */
#define st_errout stdout
/* #define st_errout stderr */
  
/*
   CALLEE_SAVES_N_WORDS: size of an array (in terms of words) that is
   enough to hold all callee-saved registers (except for TLS_REG
   below), assuming the first word is properly aligned for double.

   CS_ASSUMES_DW_ALIGNED: 1 if callee-save registers area must be double-word
   aligned (true on CPUs that use store-/load-double instructions to 
   save/restore registers or CPUs that have floating-point callee-save 
   registers)
   
   POINTER_IS_NOT_DW_ALIGNED: 1 if pointers are not double-word aligned by
   default (true on most 32 bit CPUs)

   TLS_REG_NAME: the name of the register that always points to the thread 
   local storage. it must be a callee-save register.

   MO_R_BYPASS_R : 1 if a read bypasses another read
   MO_R_BYPASS_W : 1 if a read bypasses a write
   MO_W_BYPASS_R : 1 if a write bypasses a read
   MO_W_BYPASS_W : 1 if a write bypasses another write
   HAVE_CMP_AND_SWAP: 1 if the CPU has compare_and_swap instruction.
   HAVE_SWAP: 1 if the CPU has swap instruction.
   HAVE_LL_SC: 1 if the CPU has load-linked/store-conditional instruction.

   HAVE_GP: 1 if the CPU has a global register.
   STACK_GROWS_HIGHER_ADDRESS: 1 if the stack grows towards higher addresses.
*/
  
/* i386 configurations */
#if defined(i386)
  
/* callee-save registers saved for i386 (see $GCC/config/i386/i386.h).
   the original note here reads "esi and ebx".
   NOTE(review): ebx is TLS_REG_NAME below, and CALLEE_SAVES_N_WORDS is
   described above as excluding the TLS register, so the second register
   is presumably edi rather than ebx -- confirm against asmpp.awk. */
#define CALLEE_SAVES_N_WORDS 2	
#define CS_ASSUMES_DW_ALIGNED 0
#define POINTER_IS_NOT_DW_ALIGNED 1
/* round x up (RAISE) or down (DROP) to a multiple of 8 */
#define STACK_ALIGN_RAISE(x) (((x) + 7) & ~7)
#define STACK_ALIGN_DROP(x) ((x) & ~7)
  
/* if you change this, make sure you change 
   options_i386 and asmpp.awk also */
#define TLS_REG_NAME ("ebx")	
/* memory ordering: no kind of access is declared to bypass another */
#define MO_R_BYPASS_R 0
#define MO_R_BYPASS_W 0
#define MO_W_BYPASS_R 0
#define MO_W_BYPASS_W 0
/* available atomic primitives */
#define HAVE_CMP_AND_SWAP 1
#define HAVE_SWAP 1
#define HAVE_LL_SC 0
     
/* sizes of the basic types, in bytes */
#define PTR_BYTES 4
#define INT_BYTES 4
#define LONG_BYTES 4
/* NOTE(review): the *_LSB4 / *_LSB8 switches are not explained in this
   header; presumably they select 4- and 8-byte "read and set LSB" lock
   primitives -- confirm against the lock implementation. */
#define USE_READ_AND_SET_LSB8 0
#define USE_READ_AND_SET_LSB4 1
#define HAVE_ASM_READ_AND_SET_LSB8 0
#define HAVE_ASM_READ_AND_SET_LSB4 1

#define HAVE_GP 0
#define STACK_GROWS_HIGHER_ADDRESS 0
     
/* MIPS configurations */
#elif defined(mips)
     

/* 6 double floats (20 -- 30)
   7 integers (16 -- 22) 
   (floats come first)
   i.e. 6 * 2 + 7 = 19 words with the 4-byte words configured below.
   integer register 23 is not in the list; it is TLS_REG_NAME. */
#define CALLEE_SAVES_N_WORDS 19	
#define CS_ASSUMES_DW_ALIGNED 1
#define POINTER_IS_NOT_DW_ALIGNED 1
/* round x up (RAISE) or down (DROP) to a multiple of 8 */
#define STACK_ALIGN_RAISE(x) (((x) + 7) & ~7)
#define STACK_ALIGN_DROP(x) ((x) & ~7)
#define TLS_REG_NAME ("$23")
     
/* memory ordering: no kind of access is declared to bypass another */
#define MO_R_BYPASS_R 0
#define MO_R_BYPASS_W 0
#define MO_W_BYPASS_R 0
#define MO_W_BYPASS_W 0
/* only load-linked/store-conditional is available */
#define HAVE_CMP_AND_SWAP 0
#define HAVE_SWAP 0
#define HAVE_LL_SC 1
     
/* sizes of the basic types, in bytes */
#define PTR_BYTES 4
#define INT_BYTES 4
#define LONG_BYTES 4
/* NOTE(review): meaning of the *_LSB4 / *_LSB8 switches is not explained in
   this header -- confirm against the lock implementation. */
#define USE_READ_AND_SET_LSB8 1
#define USE_READ_AND_SET_LSB4 1
#define HAVE_ASM_READ_AND_SET_LSB8 1
#define HAVE_ASM_READ_AND_SET_LSB4 1
     
#define HAVE_GP 1
#define STACK_GROWS_HIGHER_ADDRESS 0
     
/* SPARC configurations */
#elif defined(__sparc)

/* NOTE(review): unlike the other CPUs, the registers that make up these 13
   words are not listed here -- confirm against asmpp.awk. */
#define CALLEE_SAVES_N_WORDS 13
     
#define CS_ASSUMES_DW_ALIGNED 1
#define POINTER_IS_NOT_DW_ALIGNED 1
/* round x up (RAISE) or down (DROP) to a multiple of 8 */
#define STACK_ALIGN_RAISE(x) (((x) + 7) & ~7)
#define STACK_ALIGN_DROP(x) ((x) & ~7)

#define TLS_REG_NAME ("%l7")

/* memory ordering: reads are declared to bypass both reads and writes;
   writes are not declared to bypass anything */
#define MO_R_BYPASS_R 1
#define MO_R_BYPASS_W 1
#define MO_W_BYPASS_R 0
#define MO_W_BYPASS_W 0
/* available atomic primitives */
#define HAVE_CMP_AND_SWAP 1
#define HAVE_SWAP 1
#define HAVE_LL_SC 0
     
/* sizes of the basic types, in bytes */
#define PTR_BYTES 4
#define INT_BYTES 4
#define LONG_BYTES 4
/* NOTE(review): meaning of the *_LSB4 / *_LSB8 switches is not explained in
   this header -- confirm against the lock implementation. */
#define USE_READ_AND_SET_LSB8 0
#define USE_READ_AND_SET_LSB4 1
#define HAVE_ASM_READ_AND_SET_LSB8 0
#define HAVE_ASM_READ_AND_SET_LSB4 1
     
#define HAVE_GP 0
#define STACK_GROWS_HIGHER_ADDRESS 0
     
#elif defined(__alpha)
     
/* 8 double floats ($f2 -- $f9)
   5 integers ($9 -- $13)
   i.e. 13 words with the 8-byte words configured below.
   $14 is not in the list; it is TLS_REG_NAME. */
#define CALLEE_SAVES_N_WORDS 13	
				   
#define CS_ASSUMES_DW_ALIGNED 1
/* pointers are 8 bytes here, so they are double-word aligned already */
#define POINTER_IS_NOT_DW_ALIGNED 0
/* round x up (RAISE) or down (DROP) to a multiple of 8 */
#define STACK_ALIGN_RAISE(x) (((x) + 7) & ~7)
#define STACK_ALIGN_DROP(x) ((x) & ~7)
/* memory ordering: every kind of access is declared to bypass every other */
#define MO_R_BYPASS_R 1
#define MO_R_BYPASS_W 1
#define MO_W_BYPASS_R 1
#define MO_W_BYPASS_W 1
     
#define TLS_REG_NAME ("$14")
/* only load-linked/store-conditional is available */
#define HAVE_CMP_AND_SWAP 0
#define HAVE_SWAP 0
#define HAVE_LL_SC 1

/* sizes of the basic types, in bytes (the only 64-bit configuration) */
#define PTR_BYTES 8
#define INT_BYTES 4
#define LONG_BYTES 8
/* NOTE(review): meaning of the *_LSB4 / *_LSB8 switches is not explained in
   this header -- confirm against the lock implementation. */
#define USE_READ_AND_SET_LSB8 1
#define USE_READ_AND_SET_LSB4 1
#define HAVE_ASM_READ_AND_SET_LSB8 1
#define HAVE_ASM_READ_AND_SET_LSB4 1
				   
#define HAVE_GP 1
#define STACK_GROWS_HIGHER_ADDRESS 0
				   
#else
#error "unknown CPU"
#endif 
/* defined(i386)||defined(mips)||defined(__sparc)||defined(__alpha) */
#endif
/* !defined(PROC_INFO_ONLY) */				   

				   
/* do not change this: we assume the value is -1 in some places.
   note that ST_LOCKED and ST_FAULT share the value -1, so the two cannot
   be told apart by value alone.
   NOTE(review): which functions return these codes is not visible in this
   header -- confirm against the lock implementation. */
#define ST_LOCKED -1
#define ST_FAULT -1
#define ST_BUSY -2
     
/* macros SWAP_LONG/CMP_AND_SWAP_LONG.
   result in compilation error if they 
   are used where they are unavailable.

   the "...." expansion in the fallback branches is deliberate: it is not
   valid C, so any use of the macro on a CPU without the corresponding
   instruction fails to compile.  defining the macro that way is harmless
   as long as it is never expanded. */
#if HAVE_SWAP
long asm_swap_long(long volatile *, long);
#define SWAP_LONG(l, v) asm_swap_long(l, v)
#else  /* HAVE_SWAP */
#define SWAP_LONG(l, v) ....
#endif /* HAVE_SWAP */
				   
/* CMP_AND_SWAP_LONG maps to the native compare-and-swap routine when the
   CPU has one, and otherwise to a routine built on load-linked/
   store-conditional (as its name says) when that is available. */
#if HAVE_CMP_AND_SWAP
long asm_cmp_and_swap_long(long volatile *, long, long);
#define CMP_AND_SWAP_LONG(l, v, new_v) asm_cmp_and_swap_long(l, v, new_v)
#elif HAVE_LL_SC
long asm_cmp_and_swap_long_by_ll_sc(long volatile *, long, long);
#define CMP_AND_SWAP_LONG(l, v, new_v) asm_cmp_and_swap_long_by_ll_sc(l, v, new_v)
#else  /* !HAVE_CMP_AND_SWAP && !HAVE_LL_SC */
#define CMP_AND_SWAP_LONG(l, v, new_v) ....
#endif /* HAVE_CMP_AND_SWAP || HAVE_LL_SC */


/* macros for addresses in stack.

   they hide the direction of stack growth, so that code can be written in
   terms of "growing" and "shrinking" the stack:

   GROW_STACK(orig, offs)            address OFFS further towards the top
   GROW_STACK2(orig, offs1, offs2)   same, with two offsets
   SHRINK_STACK(orig, offs)          address OFFS further towards the bottom
   SP_LT, SP_GT, SP_LEQ, SP_GEQ      comparison operators between two stack
                                     addresses, used infix (a SP_LT b).
                                     the address nearer to the top (the one
                                     obtained by GROW_STACK) is the greater.
   SP_DIFF(t, b)                     distance from B up to T (positive when
                                     T is nearer to the top than B)
   BOTTOM_OF_THE_WORLD               an address that no stack address is
                                     SP_LT of

   BOTTOM_OF_THE_WORLD for downward-growing stacks is the all-ones address.
   it is spelled ~0UL (equal to ULONG_MAX) so that using the macro does not
   require <limits.h>, which this header does not include. */
#if STACK_GROWS_HIGHER_ADDRESS
#define GROW_STACK(orig, offs) ((orig) + (offs))
#define GROW_STACK2(orig, offs1, offs2) ((orig) + (offs1) + (offs2))
#define SHRINK_STACK(orig, offs) ((orig) - (offs))
#define SP_LT <
#define SP_GT >
#define SP_LEQ <=
#define SP_GEQ >=
#define SP_DIFF(t,b) ((t) - (b))
#define BOTTOM_OF_THE_WORLD ((void*)0)
#else
#define GROW_STACK(orig, offs) ((orig) - (offs))
#define GROW_STACK2(orig, offs1, offs2) ((orig) - (offs1) - (offs2))
#define SHRINK_STACK(orig, offs) ((orig) + (offs))
#define SP_LT >
#define SP_GT <
#define SP_LEQ >=
#define SP_GEQ <=
#define SP_DIFF(t,b) ((b) - (t))
#define BOTTOM_OF_THE_WORLD ((void*)~0UL)
#endif

/* SGC_CLEAR_STACK_N(from, to): when WITH_SGC is defined, write zero to the
   bytes at distance FROM .. TO (both inclusive) from the current stack
   pointer (obtained by asm_get_sp), on the side the stack grows towards.
   when WITH_SGC is not defined, it expands to an empty statement and the
   arguments are not evaluated.

   the arguments are parenthesized in the expansion and the macro's own
   locals carry an sgc_ prefix, so that an argument such as `a ? b : c' or
   one that mentions a caller's variable named `i' or `sp' works as
   expected.  as before, TO is evaluated on every iteration. */
#ifdef WITH_SGC
#if STACK_GROWS_HIGHER_ADDRESS
#define SGC_CLEAR_STACK_N(from, to) \
do { \
    char * sgc_sp_ = (char *)asm_get_sp(); \
    int sgc_i_; \
    for (sgc_i_ = (from); sgc_i_ <= (to); sgc_i_++) { \
	sgc_sp_[sgc_i_] = 0; \
    } \
} while(0)
#else  /* STACK_GROWS_HIGHER_ADDRESS */
#define SGC_CLEAR_STACK_N(from, to) \
do { \
    char * sgc_sp_ = (char *)asm_get_sp(); \
    int sgc_i_; \
    for (sgc_i_ = (from); sgc_i_ <= (to); sgc_i_++) { \
	sgc_sp_[-sgc_i_] = 0; \
    } \
} while(0)
#endif /* STACK_GROWS_HIGHER_ADDRESS */
#else  /* WITH_SGC */
#define SGC_CLEAR_STACK_N(from, to) do {} while(0)
#endif /* WITH_SGC */

/* note: dont do SGC_CLEAR_STACK_N(0, XX). the first parameter must be
   positive (>= 1) */
#define SGC_CLEAR_STACK() SGC_CLEAR_STACK_N(1, 4096) 


/* ------------------------------------------------------------------------ */

/* shorthand for unsigned long.  this header uses it for addresses held as
   integers and for the words of register save areas. */
typedef unsigned long uslong;
#include <st_foreign.h>

/* 
 * data structure for fork points
 */

/* a fork point is simply represented by a pair of addresses.  if a
   call site (the address of a call instruction) is greater than or
   equal to BEGIN and less than END, then it is a fork point.
   in other words the pair is the half-open range [begin, end).
   note that st_fork_point_info_t is a pointer to the structure. */
typedef struct st_fork_point_info
{
  uslong begin, end;
} * st_fork_point_info_t;

/* 
 * data structure for call sites of restart_context.
 */

/* if a call site equals to ADDR, then it is a call site of restart_context.
   NOTE(review): the structure has no ADDR field; it holds a BEGIN/END pair
   like st_fork_point_info, so the test is presumably
   begin <= call site < end -- confirm against the lookup code.
   note that st_invalid_call_site_info_t is a pointer to the structure. */
typedef struct st_invalid_call_site_info
{
  uslong begin, end;
} * st_invalid_call_site_info_t;

/* 
 * data structure for procedures.
 *
 * one record describes one procedure: where its code lies, how its stack
 * frame is laid out, and which call sites inside it are special.
 * note that st_proc_info_t is a pointer to the structure.
 */
typedef struct st_proc_info
{
  /* address of the first instruction of this procedure */
  uslong begin;			
  /* epilogue code address (serves as the end point of a procedure) */
  uslong real_epilogue;
  /* a pointer to the sequence that restores callee-save registers */
  uslong pure_epilogue;

  /* offset of the return address relative to FP (i.e., 
     FP + return_address_offset holds return address) */
  long return_address_offset;	

  /* offset of the parent FP relative to the current FP (i.e.,
     FP + parent_fp_offset holds parent FP) */
  long parent_fp_offset;

  /* displacement of the return destination. i.e., if a procedure
     that is given R as its return address eventually jumps to R + X, 
     then put X as the RETURN_DISPLACEMENT.

     on some calling conventions (such as i386), the caller passes the
     next address of the call instruction and the callee simply jumps
     to the given address (for example, i386's call instruction pushes
     $PC + 5 on the stack and ret instruction pops an item from the
     stack and simply jumps there). in this case, RETURN_DISPLACEMENT
     is simply zero.

     on other calling conventions (such as sparc), the caller just
     passes the address of the call instruction and the callee jumps
     to the given address + an offset. an offset is eight for
     procedures that return a scalar (int and friends, floats, doubles,
     and pointers) and twelve for procedures that return structures. */
  short return_displacement;

  /* size of the SP-relative portion of the stack */
  short sp_relative_size;

  /* maximum of X s.t., an instruction that shrinks the stack by X appears 
     in this procedure */
  short max_sp_shrink_size;

  /* the value of SP - FP just after prologue */
  long base_sp_minus_fp;

  /* a pointer to fork point information (an array of n_fork_points
     elements, going by "its size" below) */
  st_fork_point_info_t fork_point_info;

  /* its size */
  int n_fork_points;

  /* a pointer to information about where restart_context is called */
  st_invalid_call_site_info_t invalid_call_site_info;

  /* its size */
  int n_invalid_call_sites;

  /* next field in the table */
  struct st_proc_info * next;

  /* same as begin, except for printed as hexadecimal in debuggers */
  void (*begin_)();
  /* epilogue code address */
  void (*real_epilogue_)();
  /* pure epilogue code address */
  void (*epilogue_)();
  /* function's name (useful only for debugging) */
  char * name;
} * st_proc_info_t;

/* another table that lists all data symbols. an element pairs an address
   and its name (string). each module (.o) has a global table called
   MODULE_NAME_all_global_data_labels and the linker (stlink.awk) collects
   them in a single table. the initializer forms a large, flat, sorted table
   of <address, string> tuples.

   this mechanism is not necessary for StackThreads/MP per se. it is
   used for MTCAMP race detector for now. pretty much the same thing
   could be implemented in frontend, without bothering
   postprocessor. we implement it in the postprocessor because it
   might be useful for other purposes in future, and an implementation 
   is simpler.

   note that st_data_info_t is a pointer to the structure. */

typedef struct st_data_info
{
  uslong addr;			/* address of the data symbol */
  uslong size;			/* NOTE(review): presumably the size of the
				   object in bytes -- confirm against the
				   table generator */
  char * name;			/* name of the data symbol */
} * st_data_info_t;

#ifndef PROC_INFO_ONLY

/* data structure for keeping track of invalid frames.

   if a procedure calls restart_context to link suspended context on
   top of the stack, the bottom frame of the context is linked to the
   (the parent of)^n restart_context. we say the frame to which the bottom
   frame is linked invalid, meaning that when control reaches the frame
   again, callee-save registers will hold invalid values.

   if a frame is invalid, we must have a data structure to preserve
   callee-save registers for that frame. 

   all descriptors are linked from top to bottom (the head is available via
   tls(fixed_invalid_frames)).

   note that invalid_frame_desc_t is a pointer to the structure. */

typedef struct invalid_frame_desc
{
  uslong fixed_cs_regs[CALLEE_SAVES_N_WORDS]; /* saved registers */
  void * ra;			/* return address */
  void * fp;			/* frame pointer */
  struct invalid_frame_desc * next; /* link to the next invalid frame */
} * invalid_frame_desc_t;

/* check that fixed_cs_regs is the first member (offset 0) of the
   descriptor.  expands to a call to assert, so assert must be in scope
   where the macro is used. */
#define ASSERT_INVALID_FRAME_DESC_OFFSETS() \
assert((uslong)(&((invalid_frame_desc_t)0)->fixed_cs_regs) == 0)
     

/* context:

   a context represents a linked list of C stack frames. more specifically,
   it specifies the topmost frame and the bottom-most frame.
   the topmost frame is physically represented by (1) the program
   counter to start execution from, (2) FP of the topmost frame, and
   (3) callee-save registers to restore upon execution. the bottom-
   most frame is physically represented by (1) the address within the
   bottom-most frame at which its parent's FP is stored, (2) the
   address within the bottom-most frame at which its return address is
   stored.

   to restart the execution of a context, we restore callee-save
   registers and FP, and jump to PC. this is not quite perfect,
   though. in addition to starting the execution of the context, we
   must link the context to the current execution, so that the control
   returns to the caller (the procedure that resumes the context)
   after the execution of the context was finished. we have locations
   at which return address location and parent's FP are stored for
   this purpose.

 */

/* CS_ALIGN_REQUEST: attribute that asks for 8-byte alignment of the
   callee-save register area on CPUs that need it; empty otherwise. */
#if CS_ASSUMES_DW_ALIGNED
#define CS_ALIGN_REQUEST __attribute__ ((aligned (8)))
#else
#define CS_ALIGN_REQUEST 
#endif  

/* CS_REGS_PAD: 1 when the register area must be double-word aligned but
   pointers are not, 0 otherwise.  in struct st_context the area follows
   three pointers, so in the former case one word of padding is expected
   in front of it. */
#if CS_ASSUMES_DW_ALIGNED && POINTER_IS_NOT_DW_ALIGNED
#define CS_REGS_PAD 1
#else
#define CS_REGS_PAD 0
#endif

/* a suspended context.  the first four members are at offsets that
   capture_context assumes, so do not reorder or insert anything before
   them.  note that st_context_t is a pointer to the structure. */
typedef struct st_context
{
  /* certain offsets for these fields are assumed in capture_context.*/
  void * fixed_sp;		/* SP of the top frame
				   (this is not restored when the context
				   is resumed)
				   assume offset is 0 */
  void * fixed_fp;		/* FP of the top frame.
				   assume offset is sizeof(void*) */
  void * fixed_pc;		/* program counter to resume execution from.
				   assume offset is 2 * sizeof(void*) */

  /* callee save registers : assume offset is 3 or 4 * sizeof(void*)
     (4 when a padding word is needed to align the area; see CS_REGS_PAD) */
#pragma gccext
  uslong fixed_cs_regs[CALLEE_SAVES_N_WORDS] GCC_EXT(CS_ALIGN_REQUEST);

  /* chain of invalid frames this context holds */
  invalid_frame_desc_t invalid_frames_top;
  invalid_frame_desc_t invalid_frames_bottom;

  void * bottom_fp;		/* FP of the bottom frame */
  void ** rap;			/* return address location of the
				   bottom frame */
  void ** pfpp;			/* parent FP location of the bottom frame */
  short return_displacement;	/* see the comment in st_proc_info */
  short volatile valid;		/* initially 0.
				   becomes 1 when C's fields are completely
				   initialized and thus it becomes ready to
				   be rescheduled */
  short n_threads;		/* how many threads does this context
				   represent */
  struct st_context *q_next;	/* a link to the next (towards bottom) 
				   context in resumed contexts stack */
  struct st_context *q_prev;	/* a link to the prev (towards top) context 
				   in resumed contexts stack */ 
  /* the member below exists in both configurations, so the size of the
     structure does not depend on ST_DBG; only its name differs */
#if ST_DBG
  char * bottom_proc_name;	/* the name of the procedure of the 
				   bottom frame (for debuggers sake) */
#else  /* ST_DBG */
  char * bottom_proc_name_;
#endif /* ST_DBG */
} * st_context_t;

/* a cell of a singly-linked list of contexts.  the synchronization
   structures in this header use it for their lists of waiters.
   note that st_context_list_t is a pointer to the structure. */
typedef struct st_context_list
{
  struct st_context_list * next;	/* next cell in the list */
  st_context_t c;			/* the context held by this cell */
} * st_context_list_t;

/* data structure for spin-locked locations.

   the stored word keeps the user value shifted left by two bits; the two
   low bits are the tag that ST_INT_LOC_LOCKED tests. */
typedef struct st_int_loc
{
  int v;			/* tagged representation, not the raw value */
} st_int_loc_t;
/* user value -> stored representation (tag bits zero) */
#define ST_INT_LOC_ATTATCH_TAG(x) ((x) << 2)
/* stored representation -> user value */
#define ST_INT_LOC_STRIP_TAG(x) ((x) >> 2)
/* brace initializer for an st_int_loc_t holding user value X */
#define ST_INT_LOC_INITIALIZER(x) { ST_INT_LOC_ATTATCH_TAG(x) }
/* store user value X into *L with plain assignment */
#define ST_INT_LOC_INIT(l, x) ((l)->v = ST_INT_LOC_ATTATCH_TAG(x))
/* non-zero iff *L holds exactly user value X with both tag bits zero */
#define ST_INT_LOC_CHECK(l, x) ((l)->v == ST_INT_LOC_ATTATCH_TAG(x))
/* non-zero iff either tag bit of *L is set */
#define ST_INT_LOC_LOCKED(l) ((l)->v & 3)

/* lockable location holding a long.  the stored word keeps the user value
   shifted left by two bits; the two low bits are the tag that
   ST_LONG_LOC_LOCKED tests. */
typedef struct st_long_loc
{
  long v;			/* tagged representation, not the raw value */
} st_long_loc_t;
/* user value -> stored representation (tag bits zero) */
#define ST_LONG_LOC_ATTATCH_TAG(x) ((x) << 2)
/* stored representation -> user value */
#define ST_LONG_LOC_STRIP_TAG(x) ((x) >> 2)
/* brace initializer for an st_long_loc_t holding user value X */
#define ST_LONG_LOC_INITIALIZER(x) { ST_LONG_LOC_ATTATCH_TAG(x) }
/* store user value X into *L with plain assignment */
#define ST_LONG_LOC_INIT(l, x) ((l)->v = ST_LONG_LOC_ATTATCH_TAG(x))
/* non-zero iff *L holds exactly user value X with both tag bits zero */
#define ST_LONG_LOC_CHECK(l, x) ((l)->v == ST_LONG_LOC_ATTATCH_TAG(x))
/* non-zero iff either tag bit of *L is set */
#define ST_LONG_LOC_LOCKED(l) ((l)->v & 3)

/* lockable location holding a short.  the stored word keeps the user value
   shifted left by two bits; the two low bits are the tag that
   ST_SHORT_LOC_LOCKED tests.  since the shifted value is stored in a
   short, two fewer bits are left for the user value. */
typedef struct st_short_loc
{
  short v;			/* tagged representation, not the raw value */
} st_short_loc_t;
/* user value -> stored representation (tag bits zero) */
#define ST_SHORT_LOC_ATTATCH_TAG(x) ((x) << 2)
/* stored representation -> user value */
#define ST_SHORT_LOC_STRIP_TAG(x) ((x) >> 2)
/* brace initializer for an st_short_loc_t holding user value X */
#define ST_SHORT_LOC_INITIALIZER(x) { ST_SHORT_LOC_ATTATCH_TAG(x) }
/* store user value X into *L with plain assignment */
#define ST_SHORT_LOC_INIT(l, x) ((l)->v = ST_SHORT_LOC_ATTATCH_TAG(x))
/* non-zero iff *L holds exactly user value X with both tag bits zero */
#define ST_SHORT_LOC_CHECK(l, x) ((l)->v == ST_SHORT_LOC_ATTATCH_TAG(x))
/* non-zero iff either tag bit of *L is set */
#define ST_SHORT_LOC_LOCKED(l) ((l)->v & 3)

/* lockable location holding a char.  the stored byte keeps the user value
   shifted left by two bits; the two low bits are the tag that
   ST_CHAR_LOC_LOCKED tests.  since the shifted value is stored in a char,
   two fewer bits are left for the user value. */
typedef struct st_char_loc
{
  char v;			/* tagged representation, not the raw value */
} st_char_loc_t;
/* user value -> stored representation (tag bits zero) */
#define ST_CHAR_LOC_ATTATCH_TAG(x) ((x) << 2)
/* stored representation -> user value */
#define ST_CHAR_LOC_STRIP_TAG(x) ((x) >> 2)
/* brace initializer for an st_char_loc_t holding user value X */
#define ST_CHAR_LOC_INITIALIZER(x) { ST_CHAR_LOC_ATTATCH_TAG(x) }
/* store user value X into *L with plain assignment */
#define ST_CHAR_LOC_INIT(l, x) ((l)->v = ST_CHAR_LOC_ATTATCH_TAG(x))
/* non-zero iff *L holds exactly user value X with both tag bits zero */
#define ST_CHAR_LOC_CHECK(l, x) ((l)->v == ST_CHAR_LOC_ATTATCH_TAG(x))
/* non-zero iff either tag bit of *L is set */
#define ST_CHAR_LOC_LOCKED(l) ((l)->v & 3)

/* lockable location holding a pointer.

   unlike the integer locations, the pointer is not shifted: its own two
   low bits serve as the tag, so both ATTATCH_TAG and STRIP_TAG simply
   clear those two bits.  the pointers stored are therefore expected to be
   at least 4-byte aligned.

   the bits are cleared with an unsigned mask.  shifting the address right
   and left again as a signed long gives the same answer on the supported
   compilers, but right-shifting a negative value is implementation-defined
   and left-shifting one is undefined in C, and an address whose top bit is
   set is negative when viewed as a long. */
typedef struct st_ptr_loc
{
  void * v;			/* pointer, with the tag in its two low bits */
} st_ptr_loc_t;
/* pointer X -> stored representation (tag bits zero) */
#define ST_PTR_LOC_ATTATCH_TAG(x) ((void *)((unsigned long)(x) & ~3UL))
/* stored representation -> pointer */
#define ST_PTR_LOC_STRIP_TAG(x) ((void *)((unsigned long)(x) & ~3UL))
/* brace initializer for an st_ptr_loc_t holding pointer X */
#define ST_PTR_LOC_INITIALIZER(x) { ST_PTR_LOC_ATTATCH_TAG(x) }
/* store pointer X into *L with plain assignment */
#define ST_PTR_LOC_INIT(l, x) ((l)->v = ST_PTR_LOC_ATTATCH_TAG(x))
/* non-zero iff *L holds exactly pointer X with both tag bits zero */
#define ST_PTR_LOC_CHECK(l, x) ((l)->v == ST_PTR_LOC_ATTATCH_TAG(x))
/* non-zero iff either tag bit of *L is set */
#define ST_PTR_LOC_LOCKED(l) ((long)((l)->v) & 3)

/* data structure for synchronization */

/* a join counter.
   it maintains an integer (counter) and supports a synchronization 
   that waits for the counter to become zero.
   the value of a counter can be set at initialization and can be
   added later (by st_join_counter_spawn).
   however, it is invalid to perform a spawn operation after a 
   wait operation has been called. */

/* the join counter itself.  the last member exists in both configurations
   so that the size and layout do not depend on ST_DBG; only its name
   differs. */
typedef struct st_join_counter
{
  st_int_loc_t count;		/* the counter (a lockable location) */
  st_context_list_t waiters;	/* list of contexts; going by the name,
				   those waiting on this counter */
#if ST_DBG
  int wait_called;		/* 1 if wait has been called */
#else
  int _wait_called;		/* to make it binary compatible */
#endif
} st_join_counter_t, * st_join_counter_p;

/* static initializer for a join counter whose initial count is C, with no
   waiters and wait not yet called.

   count is an st_int_loc_t, whose stored word keeps the value shifted left
   by two bits and uses the two low bits as the lock tag (see
   ST_INT_LOC_ATTATCH_TAG and ST_INT_LOC_LOCKED).  the count therefore has
   to go through ST_INT_LOC_INITIALIZER, as ST_INT_LOC_INIT and
   ST_TICKET_LOCK_INITIALIZER do; storing C as is would, for instance, make
   a count of 1 look like a locked location. */
#define ST_JOIN_COUNTER_INITIALIZER(c) \
{ /* count = */ST_INT_LOC_INITIALIZER(c), /* waiters = */0, /* wait_called = */0 }

/* semaphore */

typedef struct st_sema
{
  st_int_loc_t lock;		/* lockable location guarding the semaphore
				   (going by its name) */
  int count;			/* current count */
  /* signal an error if count > pos_limit > 0.
     do not signal an error when pos_limit == 0 */
  int pos_limit;	       
  /* count given at initialization time */
  int initial_count;
  st_context_list_t waiters_head; /* list of contexts; going by the names, */
  st_context_list_t waiters_tail; /* head and tail of the queue of waiters */
} st_sema_t, * st_sema_p;

/* static initializer: count and initial_count are both C, pos_limit is P,
   and the waiters list is empty.  the 0 given for lock is also the stored
   representation of value 0 with no tag bit set. */
#define ST_SEMA_INITIALIZER_1(c, p) \
{ /* lock = */0, /* count = */c, /* pos_limit = */p, /* initial_count = */c, \
		/* waiters_head = */0, /* waiters_tail = */0 }

/* mutex: a wrapper around a single semaphore.  the semaphore is declared
   as a one-element array, so that m->s can be used directly where a
   pointer to the semaphore is needed. */

typedef struct st_mutex
{
  st_sema_t s[1];
} st_mutex_t, * st_mutex_p;

/* static initializer: the semaphore starts with count 1 and pos_limit 1 */
#define ST_MUTEX_INITIALIZER { ST_SEMA_INITIALIZER_1(1, 1) }

/* condition variable (thanks to Yamamoto) */

typedef struct st_cond
{
  st_int_loc_t lock;		/* lockable location guarding the structure
				   (going by its name) */
  st_context_list_t waiters_head; /* list of contexts; going by the names, */
  st_context_list_t waiters_tail; /* head and tail of the queue of waiters */
} st_cond_t, * st_cond_p;

/* static initializer: empty waiters list.  the 0 given for lock is also the
   stored representation of value 0 with no tag bit set. */
#define ST_COND_INITIALIZER \
{ /* lock = */0, /* waiters_head = */0, /* waiters_tail = */0 }

/* assert our guess of fields' offsets are correct */

/* CS_REGS_OFFS_W: expected offset of st_context's fixed_cs_regs, counted in
   pointer-sized words.  three pointers precede it; a fourth word of padding
   is expected when CS_REGS_PAD is set. */
#if CS_REGS_PAD
#define CS_REGS_OFFS_W 4
#else
#define CS_REGS_OFFS_W 3
#endif

/* check that fixed_sp, fixed_fp, fixed_pc and fixed_cs_regs of st_context
   are at the offsets 0, 1, 2 and CS_REGS_OFFS_W pointer-sized words.
   expands to a call to assert, so assert must be in scope where the macro
   is used. */
#define ASSERT_ST_CONTEXT_OFFSETS() \
assert(((uslong)(&((st_context_t)0)->fixed_sp) == 0) \
    && ((uslong)(&((st_context_t)0)->fixed_fp) == sizeof(void*)) \
    && ((uslong)(&((st_context_t)0)->fixed_pc) == sizeof(void*) * 2) \
    && ((uslong)(&((st_context_t)0)->fixed_cs_regs) == sizeof(void*) * CS_REGS_OFFS_W))

/* data structure for keeping track of exported frames. a frame is
   exported when it is unwound, or its children is nearer to the
   bottom. each processor simply retains SP above all exported frames
   and periodically removes finished frames.

   the two debugging members exist in both configurations, so the size of
   the record does not depend on ST_DBG; only their names differ.
   note that export_frame_record_t is a pointer to the structure. */

typedef struct export_frame_record
{
  void * fp;			/* the frame pointer */
  void ** rap;			/* where return address is stored */
  long base_sp_minus_fp;	/* SP - FP just after prologue */
#if ST_DBG			/* fields for debugging */
  void ** pfpp;			/* where parent FP is stored */
  char * name;			/* its name */
#else  /* ST_DBG */
  void ** pfpp_;
  char * name_;
#endif /* ST_DBG */
} * export_frame_record_t;

/* data structure for passing tasks between workers

   here is the structure for task steal message.
   simply give the victim the empty REPLY field and
   the victim fills it by a pointer to the context of the stolen thread.

   ctxt may or may not be used by the victim to store context of the
   stolen thread. when the victim gives a thread from its C stack, the
   victim uses req->ctxt (to avoid allocating context on its own
   stack) and req->reply == req->ctxt after the task is given. 
   when the victim simply gives a resumed thread, req->ctxt is not
   used and req->reply == the pointer to the resumed thread.

   NOTE(review): the structure below has no ctxt member, only reply; the
   paragraph above appears to describe the design mentioned in the next
   comment -- confirm and prune. */

/* at one time, the idea was that the requester gives the victim a buffer to 
   be filled by the stolen context. this did not work, because the context
   will enter the captured list of the victim. */

typedef struct task_steal_request
{
  st_context_t reply;		/* reply to task steal request (filled by 
				   the victim) */
} * task_steal_request_t;

/* payload of a generic worker message: a function and one pointer argument.
   NOTE(review): presumably the receiver calls f(a) -- confirm against the
   message handler.
   note that worker_msg_generic_t is a pointer to the structure. */
typedef struct worker_msg_generic
{
  void (*f)(void *);		/* the function */
  void * a;			/* its argument */
} * worker_msg_generic_t;

/* kinds of messages between workers; the kind tells which member of the
   union in struct worker_msg is in use. */
enum worker_msg_kind {
  wmk_task_steal_request, 	/* payload is a struct task_steal_request */
  wmk_generic,			/* payload is a struct worker_msg_generic */
  wmk_last			/* not a kind; equals the number of kinds */
};

/* a message between workers: a kind, the ids of the sender, and a payload
   selected by the kind.  the sender ids exist in both configurations, so
   the layout does not depend on ST_DBG; only their names differ.
   note that worker_msg_t is a pointer to the structure. */
typedef struct worker_msg
{
  enum worker_msg_kind k;	/* which member of m is valid */
#if ST_DBG
  long sender_thread_id;
  long sender_worker_id;
#else  /* ST_DBG */
  long sender_thread_id_;
  long sender_worker_id_;
#endif /* ST_DBG */
  union {
    struct task_steal_request tsr;	/* for wmk_task_steal_request */
    struct worker_msg_generic wmg;	/* for wmk_generic */
  } m;
} * worker_msg_t;

/* we maintain a (doubly-linked) list of available workers.  each cell
   in the list is the following structure and represents a
   worker. fill REQUEST slot of a cell by a pointer to
   task_steal_request to request a task to that worker.

   NOTE(review): there is no member called request; the slot that takes a
   message is msg (a worker_msg_t), so the sentence above presumably refers
   to msg -- confirm.
   note that workers_list_t is a pointer to the structure. */

typedef struct workers_list
{
  st_int_loc_t lock;		/* lockable location guarding the cell
				   (going by its name) */
  worker_msg_t msg;		/* message posted to this worker */
  struct worker_group * wg;	/* the worker group it belongs to (definition 
				   of worker group is in st_int.h) */
  struct thread_local_storage * worker_tls; /* thread local storage of a
				   worker; presumably the owner's */
  struct workers_list * prev;	/* links in the doubly-linked list */
  struct workers_list * next;
  long thread_id;		/* thread id of the owner of this cell */
  long worker_id;		/* worker id of the owner of this cell */
} * workers_list_t;

/* data structure that has global options (mainly given in the command
   line).  each member has a default given as a string by the
   ST_GOPT_DEFAULT_* macros.
   note that st_global_options_t is a pointer to the structure. */

typedef struct st_global_options
{
  long n_workers;		/* number of toplevel workers */
  char * stack_size;		/* stack size (a string like 256k, 256K, 
				   1m, 1M) */
  long steal_wait_limit;	/* how many iterations to wait for the
				   reply from the victim */
  long steal_wait_warn_limit;	/* issues warning if we do not get reply
				   after this number of iterations 
				   AFTER one recognizes that the request has
				   been picked up by the victim */
  int print_toplevel_worker_stat; /* 1 if we should print thread stat */
  int time_profile;		/* 1 if we profile toplevel worker group */
  char * time_profile_filename;	/*  filename of the profile */
  long time_profile_resolution;	/* resolution of the time profile */
  long time_profile_buffer_size; /* in-memory buffer size of the time 
				    profile */
  long stack_unwind_optimization;/* stack unwind optimization level 
				  (Tau: added a suitable default to 
				  st.c:set_default_global_options) */
} * st_global_options_t;

/* here are the default values provided by the StackThreads runtime
   (see also the table of default values in ma.c).
   they must be given as strings, including the numeric ones.
   there is one macro per member of struct st_global_options. */
#define ST_GOPT_DEFAULT_N_WORKERS "1"
#define ST_GOPT_DEFAULT_STACK_SIZE "0"
#define ST_GOPT_DEFAULT_STEAL_WAIT_LIMIT "40000"
#define ST_GOPT_DEFAULT_STEAL_WAIT_WARN_LIMIT "1000000"
#define ST_GOPT_DEFAULT_PRINT_TOPLEVEL_WORKER_STAT "0"
#define ST_GOPT_DEFAULT_TIME_PROFILE "0"
#define ST_GOPT_DEFAULT_TIME_PROFILE_FILENAME "00stprof"
#define ST_GOPT_DEFAULT_TIME_PROFILE_RESOLUTION "100"
#define ST_GOPT_DEFAULT_TIME_PROFILE_BUFFER_SIZE "8100"
#define ST_GOPT_DEFAULT_STACK_UNWIND_OPTIMIZATION "0"


/* 
 * thread statistics.
 * a set of counters; one instance is embedded in each thread local storage.
 * note that thread_stat_t is a pointer to the structure.
 */

typedef struct thread_stat
{
  long n_forks;			/* number of forks in this worker */
  long n_blocks;		/* number of blocks in this worker */
  long n_steals;		/* number of steals this worker served */
  long n_steal_timeouts;	/* number of steal requests timeout */
  long n_steal_requests_to_idle; /* number of steal requests sent to idle PE */
} * thread_stat_t;

/* 
 * runtime data structure that is private to each thread 
 */

/* USER_TLS_SIZE: number of bytes reserved for the user at the end of the
   thread local storage (one pointer). */
#define USER_TLS_SIZE sizeof(void*)
/* element type of that area.  it is a union of a double and an unsigned
   long, so that an array of it is suitably aligned for either. */
union user_tls_area { double __double; uslong __uslong; } ;

/* the per-thread runtime state.  the first three members (the ones whose
   names begin with _fixed_) are at offsets that are checked by
   ASSERT_THREAD_LOCAL_STORAGE_OFFSETS, so do not reorder or insert
   anything before them.  members that depend on ST_DBG or USE_SMALLOC have
   a same-sized counterpart in the other configuration, so that the layout
   is the same in all configurations.
   note that thread_local_storage_t is a pointer to the structure. */
typedef struct thread_local_storage
{
  /* a frame below this must be retained even if it is finished */
  void * _fixed_watermark;
  /* invalid frames in the stack is kept in the list invalid_frames. the top
     cell is used in asm_fix_callee_saves */
  invalid_frame_desc_t _fixed_invalid_frames;
  /* maximum of X s.t. addl X,SP appears to cleanup arguments.
     0 on CPUs that never do that. */
  long _fixed_max_sp_shrink_size;
  export_frame_record_t _exported_frames; /* heap of exported frames */
  int _n_exported_frames;	/* # of elements in _exported_frames */
  int _max_n_exported_frames;	/* max # of elements in _exported_frames */
  void * _dummy_exported_frame_rap[1]; /* a dummy storage which the sentinel
				      refers to via ->rap. */
  /* a dummy storage which the sentinel refers to via ->pfpp. */
#if ST_DBG
  void * _dummy_exported_frame_pfpp[1]; 
#else
  void * _dummy_exported_frame_pfpp_[1]; 
#endif

  void * _stack_bottom;		/* stack bottom */
  long _thread_id;		/* global thread id */

  int _in_handler;		/* 1 if the worker is in a handler */

  /* someday, data related to workers should move to a 
     separate structure */
  /* task stealing stuff */
  int _n_total_threads;		/* number of threads in this worker */
  int _n_resumed_threads;	/* number of threads in this workers resumed
				   contexts stack (below) */
  st_context_t _resumed_contexts_top; /* the top of resumed contexts stack */
  st_context_t _resumed_contexts_bottom; /* the bottom */

  int _you_should_be_stolen;	/* 1 if this thread should be stolen */
  int _thread_blocked;		/* 1 if a thread returns to the caller
				   by blocking (any reason other than finish) 
				   */
  st_context_t _task_steal_context; /* the context of the thread that 
				       caught work stealing request */
  task_steal_request_t _task_steal_request; /* just picked up task steal 
					       request */

  /* this cell is inserted to the list of available workers.
     the request field is used to accept a task steal request from idle
     processors */
  struct workers_list _worker_cell[1];
  struct worker_group * _wg;	/* the worker group this thread belongs to */

  /* number of slaves this worker has created */
  int _n_child_workers;
  /* thread IDs of slaves this worker has created */
  struct st_thr_id * _child_workers;
  /* number of entries in _child_workers */
  int _max_child_workers;
  
  /* random generator state */
  int _ws_rand_last;
  int _ws_randx;

  /* ID within the worker group */
  long _worker_id;

  /* pointer to global options */
  st_global_options_t _gopts;

  /* thread statistics */
  struct thread_stat _thread_stat;

  /* profiling info */
  struct profile * _profile;

  /* smalloc info.
     note that the #define below is a preprocessor directive, so it stays
     visible after the end of the structure. */
#define INTERNAL_MALLOC_FREELISTS 9
#ifdef USE_SMALLOC
  void *_pool_begin;
  void *_pool_end;
  struct free_info *_tl_free_list[INTERNAL_MALLOC_FREELISTS];
#else  /* USE_SMALLOC */
  /* have data structure to keep the size of this structure the same,
     regardless whether we use SMALLOC or not */
  void *_pool_begin_;
  void *_pool_end_;
  struct free_info *_tl_free_list_[INTERNAL_MALLOC_FREELISTS];
#endif
  
  /* allocate a chunk of maximally aligned area for user's purpose.
     the element count is USER_TLS_SIZE bytes rounded up to a whole number
     of union user_tls_area elements. */
  union user_tls_area 
    _user[(USER_TLS_SIZE+sizeof(union user_tls_area)-1)/sizeof(union user_tls_area)];
} * thread_local_storage_t;

/* assert our guess of fields' offsets are correct: _fixed_watermark,
   _fixed_invalid_frames and _fixed_max_sp_shrink_size must be at the
   offsets 0, 1 and 2 pointer-sized words.
   expands to a call to assert, so assert must be in scope where the macro
   is used. */
#define ASSERT_THREAD_LOCAL_STORAGE_OFFSETS() \
assert(((uslong)(&((thread_local_storage_t)0)->_fixed_watermark)) == 0 \
    && ((uslong)(&((thread_local_storage_t)0)->_fixed_invalid_frames)) == sizeof(void*) \
    && ((uslong)(&((thread_local_storage_t)0)->_fixed_max_sp_shrink_size)) == 2 * sizeof(void*))


/* pointer to the thread local storage is kept in a register.
   the register is the one named by TLS_REG_NAME for the CPU.
   NOTE(review): GCC_EXT is not defined in this header; presumably it passes
   its argument through when the GCC extensions are available, making this
   a global register variable declaration -- confirm against its
   definition. */
#pragma gccext
GCC_EXT(register) thread_local_storage_t __tls GCC_EXT(asm) GCC_EXT(TLS_REG_NAME);

/* states of the initialization of tss */
typedef enum tss_init_state
{
  tss_init_state_uninited, 	/* not started */
  tss_init_state_initializing, 	/* in progress */
  tss_init_state_inited		/* done */
} tss_init_state_t;

/* a flag that indicates tss has been initialized.
   actually of type tss_init_state_t.  it is declared as st_int_loc_t (a
   lockable location) rather than as the enum because it is locked.
   these two things are shared by all threads */

extern st_int_loc_t __tss_init_state;
/* NOTE(review): presumably called when a problem with tss initialization is
   detected -- confirm against its definition. */
void tss_init_error(void);

/* a ticket lock: a pair of lockable integer locations.
   NOTE(review): going by the names, no is the next ticket number to hand
   out and serving is the ticket currently being served -- confirm against
   the implementation. */
typedef struct st_ticket_lock
{
  st_int_loc_t no;
  st_int_loc_t serving;
} st_ticket_lock_t;
/* static initializer: both locations hold value 0 */
#define ST_TICKET_LOCK_INITIALIZER \
{ ST_INT_LOC_INITIALIZER(0), ST_INT_LOC_INITIALIZER(0) }

/* prototypes of public functions */

/* st.c */
/* the size of the stack being used = | SP - stack_bottom | 
   (not the size of the stack) */
long st_stack_used_bytes(void);
/* show stack trace */
void st_stack_trace(void);
/* show the contents of a context */
void st_show_context(st_context_t);
/* show exported frames */
void st_show_exported_frames(void);
/* terminate the application with the given code.
   NOTE(review): how st_app_exit and st_app_die differ is not visible in
   this header -- confirm against st.c. */
void st_app_exit(int /* code */);
void st_app_die(int /* code */);
/* #define st_stack_trace_and_die() st_app_die(1) */

/* add procedure information table */
void st_add_procedure_information_table(st_proc_info_t *);
/* lookup procedure information.
   NOTE(review): presumably the first argument is a code address, and
   zero_if_not_found selects returning 0 instead of failing when no
   procedure contains it; the role of the last argument is not visible
   here -- confirm against st.c. */
st_proc_info_t st_get_proc_info_full(void *, int zero_if_not_found,
				     st_proc_info_t);
/* lookup data information (presumably by the address of the data) */
st_data_info_t st_get_data_info(void *);

/* number of live exported frames */
int st_n_live_exported_frames(void);

/* suspend topmost N threads on the stack and save context in C */
void st_suspend_thread_n(st_context_t /* c */, int /* n */);
/* suspend topmost thread on the stack and save context in C */
#define st_suspend_thread(c) st_suspend_thread_n(c, 1)
/* make thread represented by C schedulable */
void st_resume_context(st_context_t /* c */);
/* temporarily suspend the current thread */
void st_yield(void);

/* st_lock.c */
/* operations on lockable memory locations.  three families with parallel
   signatures are declared, for int, long and pointer values:
     st_read_X               takes a location, returns a value
     st_read_and_lock_X      takes a location, returns a value
     st_read_and_lock_any_X  takes a location, el_size, n and a result
                             pointer; returns int
     st_try_read_and_lock_X  takes a location and a result pointer;
                             returns int
     st_write_and_unlock_X   takes a location and the value to write
     st_fetch_and_add_X      (int and long only) takes a location and an
                             addend
   the locking protocol and the meaning of the int results are defined in
   st_lock.c, not in this header. */

/* int */
int st_read_int(st_int_loc_t *);
int st_read_and_lock_int(st_int_loc_t *);
int st_read_and_lock_any_int(st_int_loc_t *, 
			     int /* el_size */, int /* n */, int * /* res */);
int st_try_read_and_lock_int(st_int_loc_t *, int *);
void st_write_and_unlock_int(st_int_loc_t *, int);
int st_fetch_and_add_int(st_int_loc_t *, int);

/* long */
long st_read_long(st_long_loc_t *);
long st_read_and_lock_long(st_long_loc_t *);
int st_read_and_lock_any_long(st_long_loc_t *, 
			      int /* el_size */, int /* n */, long * /* res */);
int st_try_read_and_lock_long(st_long_loc_t *, long * /* res */);
void st_write_and_unlock_long(st_long_loc_t *, long);
/* NOTE(review): declared to return int although it operates on a long
   location and takes a long addend (st_fetch_and_add_int returns int for
   an int location).  if the result is the fetched value it may be
   truncated where long is wider than int -- confirm against the
   definition in st_lock.c before changing. */
int st_fetch_and_add_long(st_long_loc_t *, long);

/* pointer */
void * st_read_ptr(st_ptr_loc_t *);
void * st_read_and_lock_ptr(st_ptr_loc_t *);
int st_read_and_lock_any_ptr(st_ptr_loc_t *, 
			     int /* el_size */, int /* n */, void ** /* res */);
int st_try_read_and_lock_ptr(st_ptr_loc_t *, void ** /* res */);
void st_write_and_unlock_ptr(st_ptr_loc_t *, void *);

/* st_sync.c */

/* join counter.
   all functions take the counter by pointer and return int (presumably a
   status code; see st_sync.c).  st_join_counter_init, _finish_1 and
   _spawn_1 take an additional int; the two macros are shorthands that
   pass 1 for it. */
int st_join_counter_init(st_join_counter_t *, int);
int st_join_counter_wait(st_join_counter_t *);
int st_join_counter_finish_1(st_join_counter_t *, int);
#define st_join_counter_finish(j) st_join_counter_finish_1(j, 1)
int st_join_counter_spawn_1(st_join_counter_t *, int);
#define st_join_counter_spawn(j) st_join_counter_spawn_1(j, 1) 
int st_join_counter_destroy(st_join_counter_t *);

/* semaphore.
   st_sema_init(sem, c) is shorthand for st_sema_init_1(sem, c, 0); the
   meaning of the third argument of st_sema_init_1 is not visible in this
   header (see st_sync.c).  all functions return int. */
int st_sema_init_1(st_sema_t *, int, int);
#define st_sema_init(sem, c) st_sema_init_1(sem, c, 0)
int st_sema_wait(st_sema_t *);
int st_sema_trywait(st_sema_t *);
int st_sema_post(st_sema_t *);
int st_sema_destroy(st_sema_t *);

/* mutex.
   every function takes the mutex by pointer and returns int (presumably a
   status code; the convention is defined in st_sync.c). */
int st_mutex_init(st_mutex_t *);
int st_mutex_trylock(st_mutex_t *);
int st_mutex_lock(st_mutex_t *);
int st_mutex_unlock(st_mutex_t *);
int st_mutex_destroy(st_mutex_t *);

/* condition variable (cond_timedwait not implemented yet).
   st_cond_wait takes the condition variable together with an st_mutex_t;
   every function returns int (convention defined in st_sync.c). */
int st_cond_init(st_cond_t *);
int st_cond_wait(st_cond_t *, st_mutex_t *);
int st_cond_signal(st_cond_t *);
int st_cond_broadcast(st_cond_t *);
int st_cond_destroy(st_cond_t *);

/* ws.c */
/* send a msg to worker W. W performs F(A). WM is a buffer to build 
   a message. the int result is not described in this header (see ws.c). */
int st_send_worker_generic_msg(struct workers_list volatile * /* w */, 
				worker_msg_t /* wm */,
				void (* /* f */)(void *), void * /* a */);

/* list of current workers in group WG
   (no group argument is passed; presumably WG is the caller's own worker
   group -- TODO confirm in ws.c) */
workers_list_t st_current_workers(void);
/* the number of workers in group WG
   (no group argument is passed; presumably WG is the caller's own worker
   group -- confirm in ws.c) */
int st_n_current_workers(void);
/* returns an int worker id (presumably that of the calling worker).
   declared with an explicit (void) so that C compilers see a real
   prototype and reject calls that pass arguments; C++ already treats an
   empty parameter list this way. */
int st_worker_id(void);
void st_add_slave_worker(void);
/* both take one void * argument; its meaning and the difference between
   "exit" and "die" are defined in ws.c */
void st_wg_exit(void *);
void st_wg_die(void *);

/* prof.c */
/* configure profile to use FILENAME as the prefix of the files.
   states shorter than RESOLUTION are merged into one. BUFFER_SIZE specifies
   the size of the in-memory buffer for each worker in bytes. */
/* void st_config_profile(st_prof_conf_t); */
/* the single configuration call described above is commented out; the
   three settings are made by the separate functions below.
   NOTE(review): st_config_profile_max_intervals is annotated with
   "buffer_size" although its name speaks of intervals rather than bytes --
   confirm the unit in prof.c. */
void st_config_profile_resolution(int /* resolution */);
void st_config_profile_max_intervals(int /* buffer_size */);
void st_config_profile_filename_prefix(char * /* filename */);
/* begin profile */
void st_begin_profile(void);
/* end profile */
void st_end_profile(void);

/* current time in ms */
long st_current_time_ms(void);
/* current time in us */
long st_current_time_us(void);
/* sleep OS thread ms */
void st_sleep_os_thread_ms(int /* ms */);
/* sleep OS thread us */
void st_sleep_os_thread_us(int /* us */);
/* yield OS thread.
   declared with an explicit (void) so that C compilers see a real
   prototype and reject calls that pass arguments; C++ already treats an
   empty parameter list this way. */
void st_yield_os_thread(void);

/* ticket lock operations on an st_ticket_lock_t.
   st_tl_lock returns a long and st_tl_unlock takes a long as its second
   argument; presumably the value returned by st_tl_lock (the ticket) is
   what must be handed back to st_tl_unlock -- TODO confirm against the
   definition. */
void st_tl_init(st_ticket_lock_t *);
long st_tl_lock(st_ticket_lock_t *);
void st_tl_unlock(st_ticket_lock_t *, long);

/* fix tls.
   st_fix_tls returns a void * and st_restore_tls takes one; presumably the
   value returned by the former is what the latter expects (confirm against
   the definitions). */
void * st_fix_tls(void);
int st_restore_tls(void *);
int st_dont_postprocess(void);

/* returns an int thread id (presumably that of the calling thread).
   declared with an explicit (void) so that C compilers see a real
   prototype and reject calls that pass arguments; C++ already treats an
   empty parameter list this way. */
int st_thread_id(void);

/* number of workers specified by -nw */
int st_n_toplevel_workers(void);

/* register accessors; going by the asm_ prefix they are presumably
   implemented in assembly.  each returns the register value as a void *. */
/* frame pointer of the current procedure */
void * asm_get_fp(void);
/* stack pointer */
void * asm_get_sp(void);
/* global pointer (declared only when HAVE_GP is non-zero) */
#if HAVE_GP
void * asm_get_gp(void);
#endif

/* memory barriers.
   each MEMBAR_x_y() expands to a call to the matching asm_membar_x_y()
   routine when the corresponding MO_*_BYPASS_* configuration macro is
   non-zero, and to a no-op otherwise.  (presumably MO_A_BYPASS_B says that
   the machine may reorder such a pair of accesses -- confirm where the
   MO_* macros are defined.)
   the no-op form is the expression ((void)0), so that the macro is a void
   expression in either configuration; code such as
   (MEMBAR_WRITE_WRITE(), x) therefore compiles on every memory model, not
   only on those that need a real barrier. */

#if MO_R_BYPASS_R
void asm_membar_read_read(void);
#define MEMBAR_READ_READ() asm_membar_read_read()
#else  /* MO_R_BYPASS_R */
#define MEMBAR_READ_READ() ((void)0)
#endif /* MO_R_BYPASS_R */

#if MO_R_BYPASS_W
void asm_membar_read_write(void);
#define MEMBAR_READ_WRITE() asm_membar_read_write()
#else  /* MO_R_BYPASS_W */
#define MEMBAR_READ_WRITE() ((void)0)
#endif /* MO_R_BYPASS_W */

#if MO_W_BYPASS_W
void asm_membar_write_write(void);
#define MEMBAR_WRITE_WRITE() asm_membar_write_write()
#else  /* MO_W_BYPASS_W */
#define MEMBAR_WRITE_WRITE() ((void)0)
#endif /* MO_W_BYPASS_W */

#if MO_W_BYPASS_R
void asm_membar_write_read(void);
#define MEMBAR_WRITE_READ() asm_membar_write_read()
#else  /* MO_W_BYPASS_R */
#define MEMBAR_WRITE_READ() ((void)0)
#endif /* MO_W_BYPASS_R */

/* some apps declare main by themselves in a way incompatible to this */
/* entry.
   st_main is declared here only for C++ translation units, with C
   linkage and an empty parameter list; C translation units get no
   declaration of st_main from this spot. */
#ifdef __cplusplus
extern "C" { int st_main(); }
#endif

/* the EXP must be a procedure call expression (arguments must not be
   a procedure call that may block). 
   EXP returns not only by finishing evaluation of EXP, but also by
   blocking (including blocking for task stealing).

   tls(thread_blocked) is set to 1 (by suspend_thread_n) to indicate that
   the control reaches there because of blocking.

   when EXP simply finishes without blocking, we simply decrement 
   tls(n_total_threads) and continue.

   when EXP blocks and this is not triggered by task stealing request, 
   tls(thread_blocked) is 1 and tls(task_steal_context) is zero, so we 
   do not do anything and continue.

   when a task steal request arrives, tls(thread_blocked) is 1 and 
   tls(task_steal_context) is non-zero. the control first reaches the
   frame that should be stolen and tls(you_should_be_stolen) is true 
   there. hence, that frame suspends itself and the control reaches
   the frame below that frame. then the next frame executes 
   restart_context to restart execution from the point where 
   task steal request was picked up. */

/* a fork point is a procedure call point enclosed by 
   st_fork_block_start/st_fork_block_start0 and 
   st_fork_block_end/st_fork_block_end0. we use these two kinds of
   symbols to avoid an annoying optimization by GCC. GCC sometimes
   merges several call sites in C program into a single call site 
   in corresponding assembly.
   (the functions are declared below with a leading "__st_" prefix.
   __st_invalid_call_start/__st_invalid_call_end are a third pair of
   markers that the text above does not describe; presumably they bracket
   a call in the same way -- TODO confirm.) */
void __st_fork_block_start(void);
void __st_fork_block_end(void);
void __st_fork_block_start0(void);
void __st_fork_block_end0(void);
void __st_invalid_call_start(void);
void __st_invalid_call_end(void);

/* runtime-internal entry points used by the macros in this header.
   their behaviour is defined in the implementation files; only what the
   declarations themselves show is noted here. */
void asm_fix_callee_saves(void);

void st_restart_context_n(st_context_t, invalid_frame_desc_t, int);

/* NOTE(review): declared with an empty parameter list, which in C leaves
   the parameters unspecified; the definition is not visible here, so it
   is left as is. */
void st_give_this_thread();

void st_give_specific_task(int, task_steal_request_t);

void st_child_was_given(invalid_frame_desc_t);
/* respond to a pending worker message.  the _user shorthand passes 1 for
   in_user. */
void st_respond_to_worker_msg(int /* specific_task? */, int /* in_user */);
#define st_respond_to_worker_msg_user(s) st_respond_to_worker_msg(s, 1)

/* the _user shorthand passes the caller's frame pointer (asm_get_fp())
   as the first argument and 1 for in_user. */
void st_remove_exported_frames_aux(void *, int /* in_user */); 
#define st_remove_exported_frames_user() \
     st_remove_exported_frames_aux(asm_get_fp(), 1)

/* if the msg field of this worker's cell (tls(worker_cell)->msg) is
   non-zero, respond to it via st_respond_to_worker_msg_user(0).
   does not perform CHECK_TSS_INIT(); see ST_STEAL_REQ_CHECK for that. */
#define ST_STEAL_REQ_CHECK_1()              \
  do {                                      \
    if (tls(worker_cell)->msg) {            \
      st_respond_to_worker_msg_user(0);     \
    }                                       \
  } while (0)

/* if the location pointed to by the rap field of the first entry of
   tls(exported_frames) holds REMOVED_RA, call
   st_remove_exported_frames_user().
   the body is wrapped in do { } while(0) so that the macro behaves as a
   single statement: a bare "if" expansion would capture a following
   "else" when used as  if (c) ST_FREE_STACK_1(); else ...  */
#define ST_FREE_STACK_1() \
do { \
  if (*tls(exported_frames)[0].rap == REMOVED_RA) \
    st_remove_exported_frames_user(); \
} while(0)

/* polling entry points.  each one first runs CHECK_TSS_INIT() and then
   the unchecked "_1" step(s):
     ST_STEAL_REQ_CHECK  -- ST_STEAL_REQ_CHECK_1
     ST_FREE_STACK       -- ST_FREE_STACK_1
     ST_POLLING          -- ST_FREE_STACK_1, then ST_STEAL_REQ_CHECK_1 */
#define ST_STEAL_REQ_CHECK()    \
  do {                          \
    CHECK_TSS_INIT();           \
    ST_STEAL_REQ_CHECK_1();     \
  } while (0)

#define ST_FREE_STACK()         \
  do {                          \
    CHECK_TSS_INIT();           \
    ST_FREE_STACK_1();          \
  } while (0)

#define ST_POLLING()            \
  do {                          \
    CHECK_TSS_INIT();           \
    ST_FREE_STACK_1();          \
    ST_STEAL_REQ_CHECK_1();     \
  } while (0)

/* ST_IN() loads the counter kept in tls(user[1]).__uslong into a variable
   named __tmp; ST_OUT() stores __tmp back.  __tmp is not declared by
   these macros: the caller must have a variable of that name in scope.
   both expand to a plain assignment expression without a trailing
   semicolon. */
#define ST_IN()  __tmp = tls(user[1]).__uslong
#define ST_OUT() tls(user[1]).__uslong = __tmp

/* counter-driven polling.  all three macros operate on a caller-provided
   variable named __tmp (see ST_IN/ST_OUT).  macro parameters are
   parenthesized so that arguments containing low-precedence operators
   (e.g. a ? b : c, or a | b) are compared/added as a whole.

   ST_INCREMENT(d)        add d to __tmp.
   ST_CHECK(Lmin)         if __tmp exceeds Lmin, reset __tmp to 0 and run
                          ST_STEAL_REQ_CHECK_1().
   ST_CHECK_COUNT(x,Lmin) same test as ST_CHECK(Lmin), then add x to
                          __tmp (the addition happens whether or not the
                          check fired). */
#define ST_INCREMENT(d) (__tmp += (d))
#define ST_CHECK(Lmin) \
do { \
  if (__tmp > (Lmin)) { \
    __tmp = 0; \
    ST_STEAL_REQ_CHECK_1(); \
  } \
} while(0)

#define ST_CHECK_COUNT(x,Lmin) \
do { \
  if (__tmp > (Lmin)) { \
    __tmp = 0; \
    ST_STEAL_REQ_CHECK_1(); \
  } \
  __tmp += (x); \
} while(0)
/*
#define ST_INCREMENT(d) tls(user[1]).__uslong += d

#define ST_CHECK(Lmin) \
do { \
  if (tls(user[1]).__uslong > Lmin) { \
    ST_STEAL_REQ_CHECK_1(); \
    tls(user[1]).__uslong = 0; \
  } \
} while(0)
*/
/* macros to check the consistency of the stack. 
   they check that the frame pointer (and the global pointer, where 
   HAVE_GP is set) holds the same value before and after a procedure 
   call. the stack pointer is expected not to have shrunk across the 
   call, but it is not compared by the macros below; note that the stack 
   pointer need not be the same.
*/

#if STACK_INV_CHECK

/* GET_GP(): the global pointer as a void *.  evaluates to a null pointer
   constant when HAVE_GP is zero or undefined, and to asm_get_gp()
   otherwise. */
#if !HAVE_GP
#define GET_GP() ((void*)0)
#else  /* HAVE_GP */
#define GET_GP() asm_get_gp()
#endif /* HAVE_GP */

/* declares the two locals (__fp_save__, __gp_save__) used by
   SAVE_STACK_INV and CHECK_STACK_INV.  expands to a declaration without
   the trailing semicolon, so it must appear where a declaration is
   allowed and be followed by ';'. */
#define DECLARE_STACK_INV_CHECK void * __fp_save__, * __gp_save__

/* save the current FP and GP
   (asm_get_fp() and GET_GP()) into the locals declared above */
#define SAVE_STACK_INV() \
do { \
  __fp_save__ = asm_get_fp(); \
  __gp_save__ = GET_GP(); \
} while(0) 

/* check if the FP saved before call equals to the current FP */
/* the same comparison is then made for GP (via GET_GP()).  on a mismatch
   a message naming __FILE__/__LINE__ and both pointer values is written
   to stderr and st_app_exit(1) is called.  the FP is checked first, then
   the GP.  uses fprintf/stderr, so <stdio.h> must be in scope at the
   point of use; relies on the locals from DECLARE_STACK_INV_CHECK having
   been filled in by SAVE_STACK_INV(). */

#define CHECK_STACK_INV() \
do { \
  if (__fp_save__ != asm_get_fp()) { \
    fprintf(stderr, "inconsistent stack at %s:%d\nfp before call = %p != fp after call %p\n", \
	    __FILE__, __LINE__, __fp_save__, asm_get_fp()); \
    st_app_exit(1); \
  } \
  if (__gp_save__ != GET_GP()) { \
    fprintf(stderr, "inconsistent stack at %s:%d\ngp before call = %p != gp after call %p\n", \
	    __FILE__, __LINE__, __gp_save__, GET_GP()); \
    st_app_exit(1); \
  } \
} while(0)

#else  /* STACK_INV_CHECK */

/* stack checks disabled: the declaration macro still declares one
   (unused) pointer so that "DECLARE_STACK_INV_CHECK;" remains a valid
   declaration, and the save/check macros are empty statements. */
#define DECLARE_STACK_INV_CHECK   void *__never_used_var__
#define SAVE_STACK_INV()          do {} while (0)
#define CHECK_STACK_INV()         do {} while (0)

#endif /* STACK_INV_CHECK */

/* st_assert(x): runtime check that can be compiled out.
   with ST_ASSERT_ON zero it expands to an empty statement and x is not
   evaluated.  otherwise, when x is false, it writes the file, line, the
   text of x and tls(thread_id) to st_errout and calls st_app_exit(1). */
#if !ST_ASSERT_ON

#define st_assert(x) do { } while(0)

#else  /* ST_ASSERT_ON */

#define st_assert(x)                                                  \
  do {                                                                \
    if (!(x)) {                                                       \
      fprintf(st_errout,                                              \
              "%s : %d failed assertion: \"%s\" (thread_id = %ld)\n", \
              __FILE__, __LINE__, #x, tls(thread_id));                \
      st_app_exit(1);                                                 \
    }                                                                 \
  } while (0)

#endif /* ST_ASSERT_ON */

/* worker hooks */
/* a hook is a pointer to a function taking no arguments and returning
   nothing.  where hooks are registered and when they run is not visible
   in this part of the header. */
typedef void (*thread_hook_t)(void);

#endif /* PROC_INFO_ONLY */

/* declared only when USE_SMALLOC is defined.  takes an int named nw
   (presumably the number of workers, cf. the -nw option mentioned at
   st_n_toplevel_workers -- TODO confirm). */
#ifdef USE_SMALLOC
void smalloc_init(int nw );
#endif

/* tentative implementation of st_alloca.
   the allocated region becomes part of the physically top-most frame.
   it is silently deallocated when the physically top-most frame is
   finished. therefore, it works only when the caller does not block. */

#define st_alloca(n) st_toriaezu_alloca(n)

/* st_toriaezu_alloca(n) extends the stack by
   STACK_ALIGN_RAISE(n) + tls(fixed_max_sp_shrink_size) through
   asm_extend_stack(), applies SHRINK_STACK(..., tls(fixed_max_sp_shrink_size))
   to the result, and passes that together with n to
   print_toriaezu_alloca_result(), whose char * return value is the value
   of the macro.  n appears twice in the expansion, so pass an expression
   without side effects.  (what print_toriaezu_alloca_result does with its
   arguments is not visible here; going by the name it reports the result
   and hands the pointer back -- TODO confirm.) */
void * asm_extend_stack(long);
char * print_toriaezu_alloca_result(int, void *);
#define st_toriaezu_alloca(n) \
print_toriaezu_alloca_result(n, SHRINK_STACK(asm_extend_stack(STACK_ALIGN_RAISE(n) + tls(fixed_max_sp_shrink_size)), tls(fixed_max_sp_shrink_size)))

#ifdef __cplusplus
} /* extern "C" */
#endif

#endif /* ifndef __ST_H__ */

