/* 
 * th.c --- LWP and shared-memory primitives
 */

/* 
 * Copyright (c) 1999 by Kenjiro Taura, Akinori Yonezawa. All rights reserved.
 * Copyright (c) 1999 by Yoshihiro Oyama, Toshio Endo. All rights reserved.
 * Copyright (c) 1999 by Kunio Tabata. All rights reserved.
 * Copyright (c) 1999 by Mitsubishi Research Institute.  All rights reserved.
 * Copyright (c) 1999 by Information-technology Promotion Agency.  All rights reserved.
 *
 * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY EXPRESSED
 * OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
 *
 * Permission is hereby granted to use or copy this program
 * for any purpose,  provided the above notices are retained on all copies.
 * Permission to modify the code and to distribute modified code is granted,
 * provided the above notices are retained, and a notice that the code was
 * modified is included with the above copyright notice.
 */

#include <stdio.h>
#include <string.h>
#include <sys/time.h>
#include <ctype.h>
#include <stdlib.h>
#include <errno.h>

/* when adding something, make sure it is added for all the following cases */
#if defined(st_solaris_thread)	/* Solaris thread */
#elif defined(st_pthread)	/* Pthread */
#elif defined(st_old_pthread)	/* DEC's old pthread (mostly identical 
				   to Pthread) */
#elif defined(st_sfork_thread)	/* SGI's sproc */
#elif defined(st_nt_thread)	/* NT */
#elif defined(st_no_thread)	/* no threads */
#else
#error "specify thread package"
#endif


#if defined(sun) || defined(sgi) || defined(__osf__) || defined(__linux)
#include <sys/resource.h>	/* get/set rlimit */
#endif

/* OS-specific include files */
#if defined(sgi)
long sysconf(int);
long sginap(long);
#include <limits.h>		/* CLK_TCK */
#include <sched.h>		/* sched_yield */
/*
  On sgi, default thread stack size is equal to default process stack size,
  which is often too large. Such a large stack may result in memory exhaustion
  if there are many processors. That's why we set moderate size.
*/


#elif defined(sun)
#if 1
#include <time.h>		/* nanosleep */
#else
#include <unistd.h>		/* usleep */
#endif

#elif  defined(__osf__)
#include <time.h>
#include <unistd.h>
int usleep(useconds_t);

#elif defined(__linux)
#include <unistd.h>		/* gettimeofday, set/set rlimit */
#endif

#ifdef WITH_SGC
#include <sgc.h>
#endif

/* thread library-specific include files */
#if defined(st_solaris_thread)

#include <thread.h>
#ifdef WITH_SGC
#define SOLARIS_THREAD_CREATE GC_thr_create
#define SOLARIS_THREAD_JOIN GC_thr_join
#else
#define SOLARIS_THREAD_CREATE thr_create
#define SOLARIS_THREAD_JOIN thr_join
#endif

#define SET_STACKSIZE_BEFORE_CREATE 1
#define SET_STACKSIZE_AFTER_CREATE 0

#elif defined(st_pthread)||defined(st_old_pthread)/* continue: defined(SOLARIS_THREAD) */

#ifdef WITH_SGC
#define PTHREAD_CREATE GC_pthread_create
#define PTHREAD_JOIN GC_pthread_join
#else
#define PTHREAD_CREATE pthread_create
#define PTHREAD_JOIN pthread_join
#endif

/* workaround for OSF1. on OSF1, pthread.h include <c_asm.h>, which 
   contains a declaration of a procedure `asm.' this does not compile 
   with gcc, so we avoid it. */
#if defined(__osf__) && defined(__GNUC__)
#define __C_ASM_H

/* on Digital Unix 3.2, 
   prototype declaration of pthread_create in pthread.h is wrong.
   we do not use prototype on this platform.
   
   (currently it does not work on Digital UNIX 3.2 anyway)
     */
#define _CMA_PROTO_ 0
#endif

#include <pthread.h>
#include <strings.h>		/* for bcopy */

/* On sun, we still have an ability to set/get concurrency via Solaris 
   threads interface */
#if defined(sun)
void thr_yield(void);
int thr_getconcurrency(void);
int thr_setconcurrency(int);
#endif

#if defined(__linux)
/* linux does not have pthread_setstacksize */
#define SET_STACKSIZE_BEFORE_CREATE 0
#define SET_STACKSIZE_AFTER_CREATE 1
#else
#define SET_STACKSIZE_BEFORE_CREATE 1
#define SET_STACKSIZE_AFTER_CREATE 0
#endif

#elif defined(st_sfork_thread)	/* continue: defined(SOLARIS_THREAD) */

#if defined(sgi)
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
#include <sys/prctl.h>
#include <ulocks.h>
#define SFORK_DEFAULT_STACK_SIZE 4*1024*1024 /* 4Mbytes */

#ifdef WITH_SGC
#define SFORK_CREATE GC_sproc
#define SFORK_JOIN GC_waitpid
#else
#define SFORK_CREATE sproc
#define SFORK_JOIN waitpid
#endif

#elif defined(__linux)
#error "I am thinking about how to do it, sorry"
#include <linux/sched.h>
#include <linux/unistd.h>
   
#else  /* defined(sgi) */
#error "st_sfork_thread is not available on this machine"
#endif /* defined(sgi) */

#define SET_STACKSIZE_BEFORE_CREATE 1
#define SET_STACKSIZE_AFTER_CREATE 0

#elif defined(st_nt_thread)	/* continue: defined(SOLARIS_THREAD) */
#include <windows.h>

#define SET_STACKSIZE_BEFORE_CREATE 0
#define SET_STACKSIZE_AFTER_CREATE 0

#elif defined(st_no_thread)	/* continue: defined(SOLARIS_THREAD) */
#warning "no thread package won't be used (don't use -nw ..)"

#define SET_STACKSIZE_BEFORE_CREATE 0
#define SET_STACKSIZE_AFTER_CREATE 0

#else  /* defined(st_solaris_thread) || 
	  defined(st_pthread) || defined(st_old_pthread) ||
	  defined(st_sfork_thread) || defined(st_nt_thread) || 
	  defined(st_no_thread) */
#error "speicify thread package (set THREAD_PKG?)"

#endif  /* defined(st_solaris_thread) || 
	   defined(st_pthread) || defined(st_old_pthread) ||
	   defined(st_sfork_thread) || defined(st_nt_thread) || 
	defined(st_no_thread) */

#include <st.h>
#include "st_int.h"

#if SET_STACKSIZE_BEFORE_CREATE && SET_STACKSIZE_AFTER_CREATE
#error "set stacksize both before and after create"
#elif !SET_STACKSIZE_BEFORE_CREATE && !SET_STACKSIZE_AFTER_CREATE && !defined(st_no_thread)
#endif

/* deschedule an OS-thread */
PUBLIC void st_yield_os_thread()
{
#if defined(st_solaris_thread) || defined(st_pthread) || defined(st_old_pthread) || defined(st_nt_thread) || defined(st_sfork_thread)

#if defined(sgi)
  sched_yield();
#elif defined(sun)
  thr_yield();
#elif defined(__osf__) || defined(__linux)
  usleep(1);
#elif defined(_WIN32)
  Sleep(0);
#else
#waring "no way to yield a thread"
#endif /* defined(sgi) || defined(sun) || defined(_WIN32) */

#elif defined(st_no_thread)
  /* continue: 
     defined(st_solaris_thread) || 
     defined(st_pthread) || defined(st_old_pthread) || 
     defined(st_nt_thread) || defined(st_sfork_thread)
  */

  /* do nothing */

#else  /* continue: 
	  defined(st_solaris_thread) || 
	  defined(st_pthread) || defined(st_old_pthread) || 
	  defined(st_nt_thread) || defined(st_sfork_thread)
       */

#error "specify thread package"

#endif /* continue: 
	  defined(st_solaris_thread) || 
	  defined(st_pthread) || defined(st_old_pthread) || 
	  defined(st_nt_thread) || defined(st_sfork_thread) */
}

/* sleep an OS-thread ms */
PUBLIC void st_sleep_os_thread_ms(int ms) /* ms */
{

#if defined(sgi)
  static long clk_tck = 0;
  if (clk_tck == 0) {
    clk_tck = CLK_TCK;
  }
  sginap((ms * clk_tck) / 1000);
  /* end of defined(sgi) */
#elif defined(sun) 
#if 1
  /* nanosleep */
  struct timespec ts[1];
  ts->tv_sec = ms / 1000;
  ts->tv_nsec = (ms % 1000) * 1000 * 1000;
  nanosleep(ts, 0);
#else  /* 1 */
  usleep(ms * 1000);
#endif /* 1 */
  /* end of defined(sun) */

#elif defined(__linux) || defined(__osf__)
  usleep((unsigned)(ms * 1000));
  /* end of defined(__linux) || defined(__osf__) */

#elif defined(_WIN32)

  Sleep(ms);

#endif
}

/* sleep an OS-thread ms */
PUBLIC void st_sleep_os_thread_us(int us) /* us */
{

#if defined(sgi)
  static long clk_tck = 0;
  if (clk_tck == 0) {
    clk_tck = CLK_TCK;
  }
  sginap((us * clk_tck) / 1000000);
  /* end of defined(sgi) */
#elif defined(sun) 
#if 1
  /* nanosleep */
  struct timespec ts[1];
  ts->tv_sec = us / 1000000;
  ts->tv_nsec = (us % 1000000) * 1000;
  nanosleep(ts, 0);
#else  /* 1 */
  usleep(us);
#endif /* 1 */
  /* end of defined(sun) */

#elif defined(__linux) || defined(__osf__)
  usleep((unsigned)(us));
  /* end of defined(__linux) || defined(__osf__) */

#elif defined(_WIN32)

  Sleep(us / 1000);

#endif
}

/* returns current time in ms */
PUBLIC long st_current_time_ms()
{
#if defined(sgi) || defined(sun) || defined(__linux) || defined(__osf__) || defined(__CYGWIN32__)

    struct timeval tp[1];
    gettimeofday (tp, 0);
    return tp->tv_sec * 1000 + tp->tv_usec / 1000;

#elif defined(_WIN32)

#error "do this on win32"

#else
#warning "no way to get a time"
#endif

}

/* returns current time in microseconds */
PUBLIC long st_current_time_us()
{
#if defined(sgi) || defined(sun) || defined(__linux) || defined(__osf__) || defined(__CYGWIN32__)

    struct timeval tp[1];
    gettimeofday (tp, 0);
    return tp->tv_sec * 1000000 + tp->tv_usec;

#elif defined(_WIN32)

#error "do this on win32"

#else
#warning "no way to get a time"
#endif

}

/* set the size of the main stack (not thread stack) */
GLOBAL void set_proc_stack_size(long target)
{
#if defined(sgi) || defined(sun) || defined(__osf__) || defined(__linux)
  struct rlimit r[1];

  if( target == 0 ){
#if defined(sgi) && defined(st_sfork_thread)
    target = SFORK_DEFAULT_STACK_SIZE;
#else  
    return;
#endif  
  }
  if (getrlimit(RLIMIT_STACK, r) != 0) {
    fprintf(st_errout, "failed to obtain current stack limit\n");
    st_app_exit(1);
  }
  r->rlim_cur = target;
  /*
    Some systems have -1 for the rlim_max which causes setrlimit failure.
    We also use target for hard limit.
  */
  r->rlim_max = target;
    
  if (setrlimit(RLIMIT_STACK, r) != 0) {
    fprintf(st_errout, "failed to set stack limit to %ld bytes\n", target);
    st_app_exit(1);
  }
#elif defined(__CYGWIN32__)

#warning "no way to set process stack size (-ss does not work for main stack)"

#elif defined(_WIN32)

#error "do this on win32" 

#else
#warning "no way to set a stack limit"

#endif

}

void st_tl_init(st_ticket_lock_t * t)
{
  ST_INT_LOC_INIT(&t->no, 0);
  ST_INT_LOC_INIT(&t->serving, 0);
}

long st_tl_lock(st_ticket_lock_t * t)
{
  int no = st_fetch_and_add_int(&t->no, 1);
  while (st_read_int(&t->serving) < no) {
				/* do nothing */
  }
  return no;
}

void st_tl_unlock(st_ticket_lock_t * t, long no)
{
  st_write_and_unlock_int(&t->serving, no + 1);
}


/* find a free slot from array tss(thr_proc_wrapper_args). */
PRIVATE thr_proc_wrapper_arg_t allocate_thr_proc_wrapper_arg(void)
{
  int i, j;
  int n = N_THR_PROC_WRAPPER_ARGS;
  thr_proc_wrapper_arg_t t_args = tss(thr_proc_wrapper_args);
  
#if TMP
  int s;
  i = st_read_and_lock_any_int(&t_args[0].state, 
			       sizeof(struct thr_proc_wrapper_arg), n, &s);
  if (i == -1) {
    fprintf(st_errout, "%ld : we couldn't allocate thread lock\n", 
	    tls(thread_id));
  } else {
    return t_args + i;
  }

#else
  for (i = 0; i < LOCK_MAX_TRY; i++) {
    for (j = 0; j < n; j++) {
      long * sp = &t_args[j].state;
      long s = *sp;
      if (s == ST_LOCKED) continue;
      
#if HAVE_SWAP
      
      s = SWAP_LONG(sp, ST_LOCKED);
      if (s != ST_LOCKED) {
	/* we obtained the lock */
	return t_args + j;
      } else continue;
      
#elif HAVE_LL_SC || HAVE_CMP_AND_SWAP
      
      {
	long ok = CMP_AND_SWAP_LONG(sp, s, ST_LOCKED);
	if (ok) return t_args + j;
	else continue;
      }
      
#else 
      
#error "what do you think should I do"
      
#endif
    }
    st_sleep_thread_ms(1 << i);
  }
  
  fprintf(st_errout, "%ld : we couldn't allocate thread lock\n", 
	  tls(thread_id));
#endif

  st_app_die(1);
  return 0;			/* never reach */
}

#define MAX_DIGITS_IN_STACK_SIZE 11 /* upto 11 digits (i.e., < 10^12 = 1GB) */
GLOBAL long parse_stack_size(char * s)
{
  int i;
  long l = strlen(s);
  if (l > MAX_DIGITS_IN_STACK_SIZE) {
    fprintf(st_errout, 
	    "too large stack size specified in command line (%s)\n", s);
    st_app_exit(1);
  }
  /* check all characters except for the last one are digits */
  for (i = 0; i < l - 1; i++) {
    if (!isdigit((int)s[i])) {
      fprintf(st_errout, 
	      "invalid char in stack size specified in command line (%s)\n", 
	      s);
      st_app_exit(1);
    }
  }
  {
    long magnification;
    char lc = s[l - 1];
    if (lc == 'm' || lc == 'M') {
      magnification = 1024 * 1024;
    } else if (lc == 'k' || lc  == 'K') {
      magnification = 1024;
    } else if ('0' <= lc && lc <= '9') {
      magnification = 1;
    } else {
      fprintf(st_errout, 
	      "invalid char in stack size specified in command line (%s)\n", 
	      s);
      st_app_exit(1);
      magnification = 1;	/* to make gcc happy */
    }
    return magnification * atoi(s);
  }
}

extern thread_hook_t thread_start_hooks[];

PRIVATE void run_thread_start_hooks(void)
{
    thread_hook_t * hp = thread_start_hooks;
    thread_hook_t h = *hp;

    while (h){
	/* run hook */
	h();
	hp++;
	h = *hp;
    }
}

#if EXPLICIT_CONCURRENCY
/* adjust concurrency level of the underlying thread package by x.
   if this procedure is called two or more threads almost simultaneously, 
   some of them may be lost, primarily because thread packages do not provide
   a way to atomically increase or decrease concurrency.

   we could achieve atomicity by guarding calls to them by a lock, but
   users may directly create threads, in which case exact semantics of 
   concurrency cannot be achieved anyway.

   instead we just try to make simultaneous calls to this procedure
   unlikely to occur.  */
PRIVATE void add_concurrency(int x) 
{
  int err = 0;
  long no = st_tl_lock(tss(concurrency_lock));
#if defined(st_solaris_thread)	/* Solaris thread */
  int level = thr_getconcurrency();
  err = thr_setconcurrency(level + x);
  
#elif defined(st_pthread)	/* Pthread */
#if defined(sgi)
  int level = pthread_getconcurrency();
  err = pthread_setconcurrency(level + x);
  
#elif defined(sun)
  int level = thr_getconcurrency();
  err = thr_setconcurrency(level + x);

#else  /* defined(sgi) || defined(sun) */
#error "this system needs no explicit concurrency"
#endif /* defined(sgi) || defined(sun) */
  /* the end of pthread */
  
#elif defined(st_old_pthread)
#error "this system needs no explicit concurrency"
#elif defined(st_sfork_thread) || defined(st_nt_thread) || defined(st_no_thread)
#error "this system needs no explicit concurrency"
#else
#error "specify thread package"
#endif
  st_tl_unlock(tss(concurrency_lock), no);
  if (err) {
    fprintf(st_errout, "failed to set concurrency to %d\n", 
	    level + 1);
    st_app_die(1);
  }
}

/* reflect increase/decrease of the number of workers to the desirable
   concurrency */
PRIVATE void adjust_concurrency()
{
  int c = 0;
  while(1) {
    if (c > 5) {
      fprintf(st_errout, 
	      "warning: could not exactly update concurrency\n");
      break;
    } else if (st_read_long(&tss(unreflected_concurrency)) == 0){
      break;
    } else {
      long x = st_read_and_lock_long(&tss(unreflected_concurrency));
      if (x == 0) {
	st_write_and_unlock_long(&tss(unreflected_concurrency), 0); /*unlock*/
      } else if (x > 0) {
	st_write_and_unlock_long(&tss(unreflected_concurrency), 0); /*unlock*/
	add_concurrency(x);
      } else {
#if CONCURRENCY_CAN_DECREASE
	st_write_and_unlock_long(&tss(unreflected_concurrency), 0); /*unlock*/
	add_concurrency(x);
#else
	/* on Solaris, it seems that decreasing concurrency does not 
	   decrease number of LWPs (THR displayed by top command). 
	   on such machines, we leave the thread termination event unreflected.
	   when a thread is added later, this LWP may be reused	*/
	st_write_and_unlock_long(&tss(unreflected_concurrency), x); /*unlock*/
	break;
#endif
      }
      c++;
    }
  }
}
#endif /* EXPLICIT_CONCURRENCY */

/* called from thr_proc_wrapper in st_nopp.c, which saves/restores tls 
   register before/after calling this procedure */
GLOBAL void st_thr_proc_wrapper1(thr_proc_wrapper_arg_t arg)
{
  struct thread_local_storage tls_storage[1];
  
  thr_proc_t f;
#if (THR_PROC_N_PARAMS == 7)
  void * a0, * a1, * a2, * a3, * a4, * a5, * a6;
#else
#error change here accordingly when you change THR_PROC_N_PARAMS
#endif
  /* initialize tls (the very first procedure call) */
  st_init_tls(tls_storage);
#ifdef WITH_SGC
  /* initialize GC-tls, after the initialization of tls */
  SGC_init_local();
#endif
  
#if EXPLICIT_CONCURRENCY
  /* st_thr_create increased tss(unreflected_concurrency).
     here we reflect it and any yet unreflected concurrency caused 
     thereafter. */
  adjust_concurrency();
#endif
  /* if some sort of process creation system calls (such as sproc of sgi
     or clone in linux), we set stack size here */
#if SET_STACKSIZE_AFTER_CREATE
  {
    long ss = parse_stack_size(tls(gopts)->stack_size);
    set_proc_stack_size(ss);
  }
#endif

  /* read args and invoke f(a) */
  f = arg->f;
#if (THR_PROC_N_PARAMS == 7)
  a0 = arg->a[0]; a1 = arg->a[1]; a2 = arg->a[2];
  a3 = arg->a[3]; a4 = arg->a[4]; a5 = arg->a[5];
  a6 = arg->a[6];
  /* clear args just in case (for GCs sake)  */
  arg->a[0] = 0; arg->a[1] = 0; arg->a[2] = 0;
  arg->a[3] = 0; arg->a[4] = 0; arg->a[5] = 0;
  arg->a[6] = 0;
#else
#error change here accordingly when you change THR_PROC_N_PARAMS
#endif
  MEMBAR_READ_WRITE();
  st_write_and_unlock_int(&arg->state, 0); /* say I have read the arg */

  run_thread_start_hooks();
    
#if (THR_PROC_N_PARAMS == 7)
  f(a0, a1, a2, a3, a4, a5, a6);
#else
#error change here accordingly when you change THR_PROC_N_PARAMS
#endif
  
#if EXPLICIT_CONCURRENCY
  /* this thread dies */
  st_fetch_and_add_long(&tss(unreflected_concurrency), -1);
  adjust_concurrency();
#endif

}

/* 
   entry point of a thread
   before launching a thread, we save fixed register and set the
   pointer to TLS.  we must save fixed register because it is a
   callee-save register and this procedure is called BY a procedure
   that does not know it is fixed. the caller of this procedure thinks
   its value does not change across the call. */

GLOBAL void st_thr_proc_wrapper(thr_proc_wrapper_arg_t arg)
{
  /* although this is an upcall from external code, this case cannot 
     be handled in the same way as other cases. TLS has not been 
     allocated nor registered. */
  thread_local_storage_t save_tls = __tls;
  st_dont_postprocess();
  st_thr_proc_wrapper1(arg);
  __tls = save_tls;
}

/* the interface to create an LWP that does f(a) in an apropriately
   initialized environment.
   that is, tls has been initialized when f(a) is called.
   actually creates a thread that calls worker_func_wrapper, which 
   then calls f(a).

   since we have to put args and procedure in memory, why don't we
   extend the thread creation facility so that the procedure takes
   much more arguments.

 */

/* initialize thread attributes (executed only once) */
PRIVATE void thr_onetime_init(void)
{
  st_thr_attribute_t ta = tss(st_thr_attr);
  if (st_read_int(&ta->state) == (int)thr_attr_init_state_uninit) {
    /* I may be the first one that reaches this point.
       we first parse stack size given by the command line
       and then try to obtain lock */
    long ss = parse_stack_size(tss(global_options)->stack_size);
    int state;

 /* Pthread-speicific ones */
#if defined(st_pthread) || defined(st_old_pthread)
    pthread_attr_t attr[1];
    pthread_condattr_t condattr[1];
    pthread_mutexattr_t mutexattr[1];
    /* initialize worker attribute if I am the first thread that
       reaches this point */
    int err;
#if defined(st_pthread)
    pthread_attr_init(attr);
    pthread_condattr_init(condattr);
    pthread_mutexattr_init(mutexattr);
#elif defined(st_old_pthread)
    pthread_attr_create(attr);
    pthread_condattr_create(condattr);
    pthread_mutexattr_create(mutexattr);
#endif /* defined(st_pthread) || defined(st_old_pthread) */

#if SET_STACKSIZE_BEFORE_CREATE
    if (ss) {
      err = pthread_attr_setstacksize(attr, ss);
      if (err) {
	fprintf(st_errout, "failed set stack size of worker\n");
	st_app_exit(1);
      }
    }
#endif

#if defined(sgi)

    /* nullify the default value (0) */
    err = pthread_setconcurrency(1);
    if (err) {
      fprintf(st_errout, "failed to set concurrency to 1\n");
      st_app_exit(1);
    }

#endif /* defined(sgi) */

#endif /* defined(st_pthread) || defined(st_old_pthread) */

    state = st_read_and_lock_int(&ta->state);
    if (state == thr_attr_init_state_uninit) {
      /* I am the first one that reaches this point and have
	 responsibility to initialize attributes */
      ta->stack_size = ss;	/* stack size */
#if defined(st_pthread) || defined(st_old_pthread)
      /* copy pthread-specific thread attributes */
      bcopy((void*)attr, (void*)(ta->attr), sizeof(pthread_attr_t));
      bcopy((void*)condattr, (void*)(ta->condattr), 
	    sizeof(pthread_condattr_t));
      bcopy((void*)mutexattr, (void*)(ta->mutexattr), 
	    sizeof(pthread_mutexattr_t));
#endif /* defined(st_pthread) || defined(st_old_pthread) */

#if defined(st_sfork_thread) && defined(sgi)
      /* set max allowable calls to sproc (currently 512) */
      {
	  int max_allowable_sprocs = 512;
	  if (-1 == usconfig(CONF_INITUSERS, max_allowable_sprocs)) {
	      fprintf(st_errout, "usconfig(CONF_INITUSERS, %d) failed\n", 
		      max_allowable_sprocs);
	      st_app_exit(1);
	  }
      }
#endif

      MEMBAR_WRITE_WRITE();
      /* unlock */
      st_write_and_unlock_int(&ta->state, (int)thr_attr_init_state_inited);
    } else {
      st_assert(state == (int)thr_attr_init_state_inited);
      st_write_and_unlock_int(&ta->state, (int)thr_attr_init_state_inited);
    }
  }
}

GLOBAL void st_thr_create(thr_proc_t f, st_thr_id * id,
#if (THR_PROC_N_PARAMS == 7)
			  void * a0, void * a1, void * a2, 
			  void * a3, void * a4, void * a5,
			  void * a6
			  
#else
#error change here accordingly when you change THR_PROC_N_PARAMS
#endif
)
{
  int err;
  thr_proc_wrapper_arg_t wa;
  
  thr_onetime_init();
  wa = allocate_thr_proc_wrapper_arg();
  wa->f = f; 
#if (THR_PROC_N_PARAMS == 7)
  wa->a[0] = a0; wa->a[1] = a1; wa->a[2] = a2; 
  wa->a[3] = a3; wa->a[4] = a4; wa->a[5] = a5;
  wa->a[6] = a6;
  
#else
#error change here accordingly when you change THR_PROC_N_PARAMS
#endif

#if EXPLICIT_CONCURRENCY
  st_fetch_and_add_long(&tss(unreflected_concurrency), 1);
  /* st_thr_create increased tss(unreflected_concurrency).
     here we reflect it and any yet unreflected concurrency caused 
     thereafter. */
  adjust_concurrency();
#endif

#if defined(st_solaris_thread)

#if defined(sun)

  {
    err = SOLARIS_THREAD_CREATE(0, tss(st_thr_attr)->stack_size,
				(void*(*)(void*))st_thr_proc_wrapper, 
				(void*)wa, 
				0, /* THR_BOUND | THR_NEW_LWP */
				&id->solaris_thread_id);
  }

#else  /* defined(sun) */

#error "st_solaris_thread is not available on this machine"

#endif /* defined(sun) */

  /* continue #if defined(st_solaris_thread) */
#elif defined(st_pthread) || defined(st_old_pthread)

#if defined(sun) || defined(sgi) || defined(__osf__) || defined(__linux)

  {
    err = PTHREAD_CREATE(&id->pthread_id, tss(st_thr_attr)->attr,
			 (void*(*)(void*))st_thr_proc_wrapper, (void*)wa); 
  }

#else  /* defined(sun) || defined(sgi) || 
	  defined(__osf__) || defined(__linux)*/

#error "st_pthread is not available on this machine"
  
#endif /* defined(sun) || defined(sgi) || 
	  defined(__osf__)  || defined(__linux)*/

#elif defined(st_sfork_thread)	/* continue #if defined(st_solaris_thread) */

  /* there are no specific reasons why sproc of SGI and clone of linux 
     are written in the same section. we may be able to merge these two
     and winnt thread as "os-specific" threads 
  */

#if defined(sgi)
  {
    /* where do we set stack size for it? */
    pid_t pid = SFORK_CREATE((void (*)(void *))st_thr_proc_wrapper, PR_SALL, (void*)wa);
    if (-1 == pid) {
      err = 1;
    } else if (pid == 0) {
      /* child */
      err = 0;
    } else {
      /* parent */
      id->pid = pid;
      err = 0;
    }
  }

#elif defined(__linux)

#error "I am thinking about how to do it, sorry"
  {
    /* we allocate a specified stack size + a page.
       we later set stack size by means of specifying process stack size */

    int size = tss(st_thr_attr)->stack_size + getpagesize();
    void * sp = (void *)st_malloc(size);
    if (sp == 0) {
      fprintf(st_errout, "cannot allocate stack of %d bytes\n", size);
      err = 1;
    } else {
      pid_t pid = clone(sp, SIGCLD);
      if (-1 == pid) {
	err = 1;
      } else {
	err = 0;
      }
    }
  }

#else  /* defined(sgi) */

#error "st_sfork_thread is not available on non-SGI machines"

#endif /* defined(sgi) */

#elif defined(st_nt_thread)	/* continue #if defined(SOLARIS_THREAD) */

  {
    HANDLE hThread;
    DWORD threadId;
    hThread = CreateThread(NULL, 0, 
			   (LPTHREAD_START_ROUTINE)st_thr_proc_wrapper, 
			   (LPVOID)wa, 0, &threadId);

    if (hThread) {
      id->nt_thread_handle = hThread;
      err = 0;
    } else {
      err = 1;
    }
  }

#elif defined(st_no_thread)	/* continue #if defined(SOLARIS_THREAD) */

  err = 1;

#else  /* continue #if defined(st_solaris_thread) */

#error "have some thread package"

#endif /* #if defined(st_solaris_thread) || 
	  defined(st_pthread) || defined(st_old_pthread) || 
	  defined(st_sfork_thread) || defined(st_nt_thread) || 
	  defined(st_no_thread) 
       */

  if (err) {
    fprintf(st_errout, "failed to create thread\n");
#if defined(sgi)
    fprintf(st_errout, "did you set usconfig?\n");
#endif
    st_app_die(1);
  }
}

/* try to wait for termination of a thread of ID target. */
GLOBAL int st_thr_join(st_thr_id_t target)
{
  int err = 0;
#if defined(st_solaris_thread)
  {
    int r = SOLARIS_THREAD_JOIN(target->solaris_thread_id, 
		     /* departed */ 0, /* statusp */ 0);
    st_assert(r != ESRCH);
    st_assert(r != EDEADLK);
    err = r;
  }
#elif defined(st_pthread) || defined(st_old_pthread)
  {
    int r = PTHREAD_JOIN(target->pthread_id, /* statusp */ 0);
    st_assert(r != ESRCH);
    st_assert(r != EDEADLK);
    err = r;
  }
#elif defined(st_sfork_thread)	/* SGI's sproc */
  {
    int status;
    pid_t target_pid = target->pid;
    pid_t pid = SFORK_JOIN(target_pid, &status, 0);
    if (pid == target_pid) {
      err = 0;
    } else {
      fprintf(st_errout, 
	      "%ld : signal is delivered during waiting for a child\n", 
	      tls(thread_id));
      err = 1;
    }
  }
#elif defined(st_nt_thread)	/* NT */
  {
    if (WaitForSingleObject(target->nt_thread_handle, INFINITE) 
	== WAIT_OBJECT_0) {
      /* moved from st_thr_create according to Nishioka-san's suggestion */
      CloseHandle(target->nt_thread_handle);
      err = 0;
    } else {
      err = 1;
    }
  }
#elif defined(st_no_thread)	/* no threads */
  err = 1;			/* error */
#else
#error "specify thread package"
#endif
  return err;
}

/* called upon global initialization (st_init_tss) */
GLOBAL void make_tls_key()
{
  int err = 0;
#if defined(st_solaris_thread)
  thread_key_t key;
  if (thr_keycreate(&key, 0) != 0) err = 1;
  else err = 0;
  tss(tls_fix)->key = key;
#elif defined(st_pthread) 
  pthread_key_t key;
  if (pthread_key_create(&key, 0) != 0) err = 1;
  else err = 0;
  tss(tls_fix)->key = key;
#elif defined(st_old_pthread)
  pthread_key_t key;
  if (pthread_keycreate(&key, 0) != 0) err = 1;
  else err = 0;
  tss(tls_fix)->key = key;
#elif defined(st_sfork_thread)
  /* do nothing */
#elif defined(st_nt_thread)
  DWORD key = TlsAlloc();
  if (key == 0xffffffff) err = 1;
  else {
    err = 0;
    tss(tls_fix)->key = key;
  }
#elif defined(st_no_thread)
  /* do nothing */
#else
#error "specify thread package"
#endif
  if (err) {
    fprintf(st_errout, "could not setup for tls registration\n");
    st_app_exit(1);
  }
}

/* register T as the valid tls pointer of this (OS-level) thread */
GLOBAL void st_set_tls(thread_local_storage_t t)
{
  int err = 0;
#if defined(st_solaris_thread)
  if (thr_setspecific(tss(tls_fix)->key, (void *)t) != 0) err = 1;
  else err = 0;
#elif defined(st_pthread) || defined(st_old_pthread)
  if (pthread_setspecific(tss(tls_fix)->key, (void *)t) != 0) err = 1;
  else err = 0;
#elif defined(st_sfork_thread)
#if defined(sgi)
  /* On SGI, we use facility provided by IRIX OS that provides a
     per-process private region located at a constant address (PRDA).
     this may conflict with other libraries like MPI, so we issue a 
     warning */
#define SGI_ST_PRDA_AREA usr_prda.fill
#define SGI_ST_PRDA_AREA_SIZE sizeof(thread_local_storage_t)
#warning "StackThreads uses a part of PRDA for our purposes (check conflicts)"
  *((thread_local_storage_t*)(PRDA->SGI_ST_PRDA_AREA)) = t;
#else  /* defined(sgi) */
#error "st_sfork_thread is currently available only on SGI"
#endif /* defined(sgi) */
#elif defined(st_nt_thread)
  if (TlsSetValue(tss(tls_fix)->key, (LPVOID)t) == TRUE) err = 0;
  else err = 1;
#elif defined(st_no_thread)
  /* since we have a single thread, we simply use a global variable */
  tss(tls_fix)->the_tls = t;
#else
#error "specify thread package"
#endif
  if (err) {
    fprintf(stderr, "%ld : could not set tls\n", tls(thread_id));
    st_app_die(1);
  }
}

/* called by a procedure compiled by stgcc with possibly invalid tls pointer.
   fix it. primarily used by a callback procedure */
PUBLIC void * st_fix_tls()
{
  int err = 0;
  thread_local_storage_t t;
#if defined(st_solaris_thread)
  if (thr_getspecific(tss(tls_fix)->key, (void *)(&t)) != 0) err = 1;
  else err = 0;
#elif defined(st_pthread) || defined(st_old_pthread)
  t = (thread_local_storage_t)pthread_getspecific(tss(tls_fix)->key);
  if (t == 0) err = 1;
  else err = 0;
#elif defined(st_sfork_thread)
#if defined(sgi)
  /* see comments in st_set_tls */
  t = *((thread_local_storage_t*)(PRDA->SGI_ST_PRDA_AREA));
#else  /* defined(sgi) */
#error "st_sfork_thread is currently available only on SGI"
#endif /* defined(sgi) */
#elif defined(st_nt_thread)
  t = (thread_local_storage_t)TlsGetValue(tss(tls_fix)->key);
  if (t == 0) err = 1;
  else err = 0;
#elif defined(st_no_thread)
  /* since we have a single thread, we simply use a global variable */
  t = tss(tls_fix)->the_tls;
#else
#error "specify thread package"
#endif
  if (err) {
    fprintf(stderr, "%ld : could not get tls\n", tls(thread_id));
    st_app_die(1);
    return 0;			/* never reach */
  } else {
    void * old_tls = (void *)__tls;
    __tls = t;
    return old_tls;
  }
}

PUBLIC int st_restore_tls(void * t)
{
  st_dont_postprocess();
  __tls = (thread_local_storage_t)t;
  return 0;
}

/* initialize oneshot synchronization variable */

GLOBAL void st_oneshot_sync_var_init(st_oneshot_sync_var_t w)
{
  int err = 0;
  thr_onetime_init();		/* initialize default attributes (pthreads) */
#if defined(st_solaris_thread)
  {
    int r = mutex_init(w->m, USYNC_THREAD, 0);
    if (r == 0) {
      int r = cond_init(w->c, USYNC_THREAD, 0);
      if (r == 0) err = 0;
      else err = 1;
    } else {
      err = 1;
    }
  }
#elif defined(st_pthread) || defined(st_old_pthread)
  {
    int r = pthread_mutex_init(w->m, tss(st_thr_attr)->mutexattr);
    if (r == 0) {
      int r = pthread_cond_init(w->c, tss(st_thr_attr)->condattr);
      if (r == 0) {
	err = 0;
      } else {
	err = 1;
      }
    } else {
      err = 1;
    }
  }
#elif defined(st_sfork_thread) 
  {
    ST_INT_LOC_INIT(&w->pid, 0);
  }
#elif defined(st_nt_thread)
  {
    HANDLE h = CreateSemaphore(NULL, 0, 1, NULL);
    if (h) w->h = h;
    else err = 1;
  }
#elif defined(st_no_thread)
  {
    w->done = 0;
  }
#else
#error "specify thread package"
#endif
  if (err) {
    fprintf(st_errout, "%ld : could not init st_oneshot_sync_var\n", 
	    tls(thread_id));
    st_app_die(1);
  }
  ST_INT_LOC_INIT(&w->waited, 0);
  ST_INT_LOC_INIT(&w->signaled, 0);
}

/* wait for W to be signaled */

GLOBAL void st_oneshot_sync_var_wait(st_oneshot_sync_var_t w)
{
  int err = 0;
  {
    /* check nobody has called wait on this sync var */
    int l = st_read_and_lock_int(&w->waited);
    if (l) {
      st_write_and_unlock_int(&w->waited, l);
      fprintf(st_errout, 
	      "%ld : st_thr_oneshot_sync_var_wait performed twice\n",
	      tls(thread_id));
      st_app_die(1);
    } else {
      /* first time (OK) */    
      st_write_and_unlock_int(&w->waited, 1);
    }
  }

#if defined(st_solaris_thread)
  {
    /* if somebody has called sianal, we can proceed (even if the call
       has not completed yet) */
    int l = st_read_int(&w->signaled);
    if (l == 0) {
      /* grab the lock to serialize signal and wait */
      if (mutex_lock(w->m) != 0) {
	mutex_unlock(w->m);
	fprintf(st_errout, "%ld : mutex_lock faied\n", tls(thread_id));
	st_app_die(1);
      }
      /* if the above lock follows the signaler, w->signaled must be 1.
	 otherwise, we wait on the condition variable and the signaler 
      will find that somebody is waiting on the condition variable */
      l = st_read_int(&w->signaled);
      if (l) {
	mutex_unlock(w->m);
      } else {
	while(1) {
	  if (cond_wait(w->c, w->m) != 0) {
	    if (st_read_int(&w->signaled)) {
	      break;
	    } else {
	      fprintf(st_errout, 
		      "%ld : cond_wait interrupted by signal? (still wait)\n",
		      tls(thread_id));
	    }
	  } else {
	    st_assert(st_read_int(&w->signaled));
	    break;
	  }
	}
	mutex_unlock(w->m);
      }
    }
  }

#elif defined(st_pthread) || defined(st_old_pthread)
  {
    int l = st_read_int(&w->signaled);
    if (l == 0) {
      /* grab the lock to serialize signal and wait */
      if (pthread_mutex_lock(w->m) != 0) {
	pthread_mutex_unlock(w->m);
	fprintf(st_errout, "%ld : pthread_mutex_lock faied\n", tls(thread_id));
	st_app_die(1);
      }
      /* if the above lock follows the signaler, w->signaled must be 1.
	 otherwise, we wait on the condition variable and the signaler 
      will find that somebody is waiting on the condition variable */
      l = st_read_int(&w->signaled);
      if (l) {
	pthread_mutex_unlock(w->m);
      } else {
	while(1) {
	  if (pthread_cond_wait(w->c, w->m) != 0) {
	    if (st_read_int(&w->signaled)) {
	      break;
	    } else {
	      fprintf(st_errout, 
		      "%ld : cond_wait interrupted by signal? (still wait)\n",
		      tls(thread_id));
	    }
	  } else {
	    st_assert(st_read_int(&w->signaled));
	    break;
	  }
	}
	pthread_mutex_unlock(w->m);
      }
    }
  }

#elif defined(st_sfork_thread) 
  {
    /* register the name of the waiting process. */
    pid_t pid = getpid();
    long old_pid = st_read_and_lock_long(&w->pid);
    st_write_and_unlock_long(&w->pid, pid); /* unlock */
    st_assert(old_pid == 0);
    st_assert(sizeof(long) >= sizeof(pid_t));
    blockproc(pid);
  }
#elif defined(st_nt_thread)
  {
    if (WAIT_OBJECT_0 != WaitForSingleObject(w->h, INFINITE)) err = 1;
    else err = 0;
  }
#elif defined(st_no_thread)
  {
    if (!w->done) {
      err = 1;
    } else err = 0;
  }
#else
#error "specify thread package"
#endif
  if (err) {
    fprintf(st_errout, 
	    "%ld : thread block wait failed\n", tls(thread_id));
    st_app_die(1);
  }
}

/* signal W */

GLOBAL void st_oneshot_sync_var_signal(st_oneshot_sync_var_t w)
{
  int err = 0;
  {  
    int l = st_read_and_lock_int(&w->signaled);
    if (l) {
      st_write_and_unlock_int(&w->signaled, l);
      fprintf(st_errout, 
	      "%ld : st_thr_oneshot_sync_var_wait performed twice\n",
	      tls(thread_id));
    } else {
      /* first time (OK) */    
      st_write_and_unlock_int(&w->signaled, 1);
    }
  }
#if defined(st_solaris_thread)
  {
    if (mutex_lock(w->m) != 0) {
      mutex_unlock(w->m);
      fprintf(st_errout, "mutex_lock faied\n");
      st_app_die(1);
    }
    if (cond_signal(w->c) != 0) {
      fprintf(st_errout, "cond_signal faied\n");
      st_app_die(1);
    }
    mutex_unlock(w->m);
  }
  
#elif defined(st_pthread) || defined(st_old_pthread)
  {
    if (pthread_mutex_lock(w->m) != 0) {
      pthread_mutex_unlock(w->m);
      fprintf(st_errout, "mutex_lock faied\n");
      st_app_die(1);
    }
    if (pthread_cond_signal(w->c) != 0) {
      fprintf(st_errout, "cond_signal faied\n");
      st_app_die(1);
    }
    pthread_mutex_unlock(w->m);
  }

#elif defined(st_sfork_thread) 
  {
    long pid = st_read_and_lock_long(&w->pid);
    st_write_and_unlock_long(&w->pid, pid); /* unlock */
    if (pid) unblockproc(pid);
  }
#elif defined(st_nt_thread)
  {
    LONG p[1];
    if (ReleaseSemaphore(w->h, 1, p) == FALSE) err = 1;
    else err = 0;
  }
#elif defined(st_no_thread)
  {
    w->done = 1;
  }
#else
#error "specify thread package"
#endif
  if (err) {
    fprintf(st_errout, 
	    "%ld : thread block wait signal failed\n",
	    tls(thread_id));
    st_app_die(1);
  }
}

/* 1 if w has been signaled */

GLOBAL int st_oneshot_sync_var_signaled(st_oneshot_sync_var_t w)
{
  int r;
#if defined(st_solaris_thread)
  if (st_read_int(&w->signaled)) r = 1;
  else r = 0;
#elif defined(st_pthread) || defined(st_old_pthread)
  if (st_read_int(&w->signaled)) r = 1;
  else r = 0;
#elif defined(st_sfork_thread) 
  if (st_read_long(&w->pid)) r = 1;
  else r = 0;
#elif defined(st_nt_thread)
  if (WAIT_OBJECT_0 == WaitForSingleObject(w->h, 0)) r = 1;
  else r = 0;
#elif defined(st_no_thread)
  if (w->done) r = 1;
  else r = 0;
#else
#error "specify thread package"
#endif
  return r;
}

PUBLIC int st_thread_id()
{
  return tls(thread_id);
}

