/* 
 * ws.c --- work sharing facilities
 */

/* 
 * Copyright (c) 1999 by Kenjiro Taura, Akinori Yonezawa. All rights reserved.
 * Copyright (c) 1999 by Yoshihiro Oyama, Toshio Endo. All rights reserved.
 * Copyright (c) 1999 by Kunio Tabata. All rights reserved.
 * Copyright (c) 1999 by Mitsubishi Research Institute.  All rights reserved.
 * Copyright (c) 1999 by Information-technology Promotion Agency.  All rights reserved.
 *
 * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY EXPRESSED
 * OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
 *
 * Permission is hereby granted to use or copy this program
 * for any purpose,  provided the above notices are retained on all copies.
 * Permission to modify the code and to distribute modified code is granted,
 * provided the above notices are retained, and a notice that the code was
 * modified is included with the above copyright notice.
 */

#include <strings.h>
#include <st.h>
#include "st_int.h"

#define SHOW_IDLE 0

PUBLIC int stf_init_worker_group_conf(worker_group_conf_t c)
{
  ST_CALLBACK_BEGIN();

  c->print_stat = 0;
  c->prof_conf.do_profile = 0;
  c->prof_conf.resolution = 100;
  c->prof_conf.max_intervals = 8100;
  c->prof_conf.filename_prefix = 0;

  ST_CALLBACK_END();
  return 0;
}

PUBLIC int stf_worker_group_conf_set_print_stat(worker_group_conf_t c, int ps)
{
  ST_CALLBACK_BEGIN();
  c->print_stat = ps;
  ST_CALLBACK_END();
  return 0;
}

PUBLIC int stf_worker_group_conf_get_print_stat(worker_group_conf_t c)
{
  ST_CALLBACK_BEGIN();
  int r = c->print_stat;
  return r;
  ST_CALLBACK_END();
}

PUBLIC int stf_worker_group_conf_set_prof_do_profile(worker_group_conf_t c, int dp)
{
  ST_CALLBACK_BEGIN();
  int r;
  if (dp && c->prof_conf.filename_prefix == 0) {
    fprintf(st_errout, 
	    "worker %ld : do_profile, yet worker_group_conf_set_prof_filename "
	    "has not been called\n", tls(worker_id));
    st_app_die(1);
    r = -1;
  } else {
    c->prof_conf.do_profile = dp;
    r = 0;
  }
  ST_CALLBACK_END();
  return r;
}

PUBLIC int stf_worker_group_conf_get_prof_do_profile(worker_group_conf_t c)
{
  ST_CALLBACK_BEGIN();
  int r = c->prof_conf.do_profile;
  ST_CALLBACK_END();
  return r;
}

PUBLIC int stf_worker_group_conf_set_prof_resolution(worker_group_conf_t c, int r)
{
  ST_CALLBACK_BEGIN();
  c->prof_conf.resolution = r;
  ST_CALLBACK_END();
  return 0;
}

PUBLIC int stf_worker_group_conf_get_prof_resolution(worker_group_conf_t c)
{
  ST_CALLBACK_BEGIN();
  int r = c->prof_conf.resolution;
  ST_CALLBACK_END();
  return r;
}

PUBLIC int stf_worker_group_conf_set_prof_buffer_size(worker_group_conf_t c, int bs)
{
  ST_CALLBACK_BEGIN();
  c->prof_conf.max_intervals = bs;
  ST_CALLBACK_END();
  return 0;
}

PUBLIC int stf_worker_group_conf_get_prof_buffer_size(worker_group_conf_t c)
{
  ST_CALLBACK_BEGIN();
  int r = c->prof_conf.max_intervals;
  return r;
  ST_CALLBACK_END();
}

PUBLIC int stf_worker_group_conf_set_prof_filename(worker_group_conf_t c, char * name)
{
  ST_CALLBACK_BEGIN();
  if (name == 0) {
    fprintf(st_errout, 
	    "worker %ld : worker_group_conf_set_prof_filename is given "
	    "a zero as a filename", 
	    tls(worker_id));
    st_app_die(1);
    return -1;
  }
  c->prof_conf.filename_prefix = name;
  ST_CALLBACK_END();
  return 0;
}

PUBLIC char * stf_worker_group_conf_get_prof_filename(worker_group_conf_t c)
{
  ST_CALLBACK_BEGIN();
  char * r = c->prof_conf.filename_prefix;
  ST_CALLBACK_END();
  return r;
}

GLOBAL void init_worker_group(worker_group_t wg, worker_group_conf_t wgc)
{
  ST_LONG_LOC_INIT(&wg->worker_id_seed[0], 0);
  /* initially, task requests ports is empty */
  wg->workers = 0;	
  ST_LONG_LOC_INIT(&wg->n_workers, 0);
  /* we are not done, of course */
  st_oneshot_sync_var_init(wg->done_var);
  ST_INT_LOC_INIT(&wg->status, wg_status_running);
  wg->return_value = 0;
  wg->entry_returned = 0;
  /* nobody has noticed we are done, of course */
  ST_INT_LOC_INIT(&wg->n_quittings, 0);
  ST_INT_LOC_INIT(&wg->profiling, profiling_stopping);
  /* set configuration */
  if (wgc) {
    wg->conf = *wgc;
  } else {
    stf_init_worker_group_conf(&wg->conf);
  }
}

/* return the list of current workers. */
GLOBAL workers_list_t st_current_workers_1(worker_group_t wg)
{
  return (workers_list_t)st_read_ptr((st_ptr_loc_t *)(&wg->workers));
}

PUBLIC workers_list_t st_current_workers()
{
  return st_current_workers_1(tls(wg));
}

/* return the current number of workers.
   may be larger than the number of workers in the list. */
GLOBAL int st_n_current_workers_1(worker_group_t wg)
{
  return st_read_int(&wg->n_workers);
}

PUBLIC int st_n_current_workers()
{
  return st_n_current_workers_1(tls(wg));
}

PUBLIC int st_worker_id()
{
  return tls(worker_id);
}

/* add my port of task steal request to the list of available 
   task request ports */

PRIVATE void add_to_workers(workers_list_t cell, worker_group_t wg, long wid)
{
  workers_list_t head;
  /* set worker ID */
  tls(worker_id) = wid;
  tls(wg) = wg;

  /* initialize the cell */
  ST_INT_LOC_INIT(&cell->lock, 0);
  cell->msg = 0;
  cell->worker_tls = __tls;
  cell->wg = wg;
  cell->thread_id = tls(thread_id);
#if ST_DBG
  cell->worker_id = tls(worker_id);
#endif /* ST_DBG */
 
  cell->prev = 0;
  
  /* add this cell to the list of WORKERS in this group */
  head = (workers_list_t)st_read_and_lock_ptr((st_ptr_loc_t *)&wg->workers);
  if (head) head->prev = cell;
  cell->next = head;
  MEMBAR_WRITE_WRITE();
  st_write_and_unlock_ptr((st_ptr_loc_t *)&wg->workers, 
			  (void*)cell); /* unlock */
  /* here WORKERS and N_WORKERS are inconsistent */

  /* increment the number of workers. */
  st_fetch_and_add_int(&wg->n_workers, 1);
}

PRIVATE int accum_worker_group_stat(worker_group_t wg, thread_stat_t st)
{
  workers_list_t p;
  int nw = 0;
  workers_list_t ws = st_current_workers_1(wg);
  st->n_forks = 0;
  st->n_blocks = 0;
  st->n_steals = 0;
  st->n_steal_timeouts = 0;
  st->n_steal_requests_to_idle = 0;

  for (p = ws; p; p = p->next) {
    st->n_forks += p->worker_tls->_thread_stat.n_forks;
    st->n_blocks += p->worker_tls->_thread_stat.n_blocks;
    st->n_steals += p->worker_tls->_thread_stat.n_steals;
    st->n_steal_timeouts += p->worker_tls->_thread_stat.n_steal_timeouts;
    st->n_steal_requests_to_idle += p->worker_tls->_thread_stat.n_steal_requests_to_idle;
    nw++;
  }
  return nw;
}

PRIVATE void print_worker_group_stat(worker_group_t wg)
{
#if ST_STAT
  struct thread_stat st[1];
  int nw = accum_worker_group_stat(wg, st);
  fprintf(st_errout, 
	  "%ld forks  %ld blocks  %ld steals  %ld timeouts "
	  "%ld requests_to_idle %d workers\n", 
	  st->n_forks, st->n_blocks, st->n_steals, st->n_steal_timeouts, 
	  st->n_steal_requests_to_idle, nw);
#else
  fprintf(st_errout, "worker stat not available\n");
#endif
}

/* when a task steal request is caught, unwind stack until the bottom
   and call this procedure.
   give this (fine-grain) thread to the requester.
 */
USED_BY_MACRO void st_give_this_thread()
{
  struct st_context c[1];
  task_steal_request_t req = tls(task_steal_request);
  tls(you_should_be_stolen) = 0;
  tls(task_steal_request) = 0;
  THR_STAT_INC(n_steals); 

  c->valid = 0;
  MEMBAR_WRITE_WRITE();
  req->reply = c;
#if ST_PROF
  st_prof_switch();
#endif /* ST_PROF */
  st_suspend_thread_n(c, 1);	/* ask the parent to jump to the throw
				   point. */
}

USED_BY_MACRO void st_child_was_given(invalid_frame_desc_t iff)
{
  st_context_t tsc = tls(task_steal_context); 
  tls(task_steal_context) = 0; 
  tls(thread_blocked) = 0; 
  tls(n_total_threads)++; 
#if ST_PROF
  st_prof_switch(); 
#endif
  st_restart_context_n(tsc, iff, 2);
}

/* give a task at N th bottom most position in the resumed contexts stack.
   i.e., give_resumed_task(0) gives the bottom task in the stack.
   assume the stack has at least N items.
   also assume that this processor has already grabbed a request from 
   a processor and the request is REQ. */

PRIVATE void give_resumed_task(int n, task_steal_request_t req)
{
  st_context_t b = tls(resumed_contexts_bottom);
  st_context_t p;
  int i;
  /* scan the stack from bottom to top */
  for (i = 0, p = b; i < n; i++, p = p->q_prev) {
    st_assert(p);
  }
  /* here P points to N th element (counting the bottom element as 0 th) */
  st_assert(p);

  {
    short nt = p->n_threads;
    st_context_t a = p->q_prev;
    st_context_t b = p->q_next;

    /* i.e., 
       q_prev links look like: 
       BOTTOM -> ... -> b -> p -> a -> ... -> TOP 
       q_next links look like: 
       BOTTOM <- ... <- b <- p <- a <- ... <- TOP */
    
    /* nullify links in p */
    p->q_prev = 0;
    p->q_next = 0;
    
    /* give P to the requester */
    req->reply = p;
    
    /* modify links to maintain the stack */
    if (a) {
      a->q_next = b;
    } else {
      tls(resumed_contexts_top) = b;
    }
    if (b) {
      b->q_prev = a;
    } else {
      tls(resumed_contexts_bottom) = a;
    }
    
    st_assert(nt == 1);
    tls(n_resumed_threads) -= nt;
    tls(n_total_threads) -= nt;
  }
}

/* give a thread at N (> 0) th topmost position in the C stack.
   i.e., give_stacked_task(1) gives the thread just below the current
   thread.
   assume the stack has at least N items.
   also assume that this processor has already grabbed a request from 
   a processor and the request is REQ. */

PRIVATE void give_stacked_task(int n, task_steal_request_t req)
{
  struct st_context c[1];
  c->valid = 0;
  tls(you_should_be_stolen) = 1;
  tls(task_steal_context) = c;
  tls(task_steal_request) = req;
  st_suspend_thread_n(c, n);
  st_assert(tls(you_should_be_stolen) == 0);
}


/* called any time. check if a request arrived at this processor.
   if it does and if this stack has more than one fine-grain
   threads, give the bottom frame to the requester. */

#define I_HAVE_NO_TASK ((st_context_t)-1)

/* give task n threads down from the stack top.
   the sender must be still waiting for the reply */
USED_BY_MACRO void st_give_specific_task(int n, task_steal_request_t req)
{
  if (n <= 0) {
    /* refer to a meaningless thread */
    req->reply = I_HAVE_NO_TASK;
  } else if (n < tls(n_total_threads) - tls(n_resumed_threads)) {
#if ST_PROF
    st_prof_serv_steal();
#endif /* ST_PROF */
    /* there are at least n + 1 tasks in the stack */
    give_stacked_task(n, req);
#if ST_PROF
    st_prof_busy();
#endif /* ST_PROF */
  } else if (n < tls(n_total_threads)) {
#if ST_PROF
    st_prof_serv_steal();
#endif /* ST_PROF */
    give_resumed_task(tls(n_total_threads) - n - 1, req);
#if ST_PROF
    st_prof_busy();
#endif /* ST_PROF */
  } else {
    /* no tasks */
    req->reply = I_HAVE_NO_TASK;
  }
}

/* called at anytime.
   check if there is a msg in the incoming msg port (tls(worker_cell)->msg).
   if there is, try to pick up it. if succeed, take an appropriate action
   according to the msg type. 

   the parameter specific_task is zero if it responds to a task steal msg
   by giving the bottom task. otherwise the parameter specifies the 
   depth of the thread in the stack, counting the current thread as zero. */

USED_BY_MACRO void st_respond_to_worker_msg(int specific_task, int in_user)
{
  worker_msg_t wm = tls(worker_cell)->msg;
  if (wm == 0) return;
  else {
    enum worker_msg_kind k;
    int success;
    worker_msg_t * wmp = &tls(worker_cell)->msg;
    /* try to remove the request from the request buffer.
       this may compete with requester's attempt to remove it from
       the buffer due to timeout. */
#if HAVE_LL_SC || HAVE_CMP_AND_SWAP
    long ok = CMP_AND_SWAP_LONG((long*)wmp, (long)wm, 0);
    success = ok;
#elif HAVE_SWAP
    long x = SWAP_LONG((long*)wmp, 0);
    success = (x != 0);
#else  /* HAVE_LL_SC || HAVE_CMP_AND_SWAP || HAVE_SWAP */
#error "what do you think should I do?"
#endif /* HAVE_LL_SC || HAVE_CMP_AND_SWAP || HAVE_SWAP */
    if (!success) return;

    /* a msg has been picked up.
       at this point, it is unsafe to read wmp again. the next msg
       may have come. 
    */
    k = wm->k;
    switch (k) {
    case wmk_task_steal_request:
      {
	if (tls(n_total_threads) > 1) {
	  if (specific_task) {
	    st_give_specific_task(specific_task, &wm->m.tsr);
	  } else {		/* bottom task */
	    st_give_specific_task(tls(n_total_threads) - 1, &wm->m.tsr);
	  }
	} else {
	  /* we would like to handle this case without removing the task
	     from the port. but it is impossible under the current protocol,
	     because the requester may have removed the task from the port */
	  wm->m.tsr.reply = I_HAVE_NO_TASK;
	}
	break;
      }
    case wmk_generic: 
      {
	worker_msg_generic_t wmg = &wm->m.wmg;
#if ST_DBG
	tls(in_handler) = 1;
#endif
#if ST_PROF
	st_prof_serv_msg();
#endif /* ST_PROF */
	(wmg->f)(wmg->a);
#if ST_DBG
	tls(in_handler) = 0;
#endif
	break;
      }
    default:
      {
	fprintf(st_errout, "unknown worker msg type %d\n", k);
	st_app_die(1);
      }
    }
#if ST_RPOF
    if (in_user) st_prof_busy();
#endif
  }
}

/* send worker_msg WM to worker W. W executes F(A). 
   reutrn 0 if the msg is successfully sent. 
   return -1 otherwise. */
PUBLIC int st_send_worker_generic_msg(struct workers_list volatile * w_, 
				      worker_msg_t wm,
				      void (*f)(void *), void * a)
{
  struct workers_list * w = (struct workers_list *)w_;
  int dum;
  st_assert(!tls(in_handler));
#if ST_PROF
  st_prof_msg();
#endif /* ST_PROF */
  wm->k = wmk_generic;
  wm->m.wmg.f = f;
  wm->m.wmg.a = a;


  if (st_try_read_and_lock_int(&w->lock, &dum) != 0) {
    return -1;		/* FAIL */
  }
  w_->msg = wm;
  while(w_->msg) {
    st_respond_to_worker_msg_sys(0);
#if ST_PROF
    st_prof_msg();
#endif /* ST_PROF */
  }
  st_write_and_unlock_int(&w->lock, 0);	/* unlock */
  return 0;			/* OK */
  /* the message has been picked up by the receiver */
}

/* try to obtain a task from a worker that belongs to the same group
   (WG). assume SELF is an element of WG->WORKERS

   the algorithm:
   try each processor once. give up requesting from a processor
   either when it is locked or it replies I_HAVE_NO_TASK. return 0 if it
   finds no task after asking each processor once.
   
   getting zero as the result value does not mean everybody is idle.
   so this procedure should be somehow interleaved with termination 
   detection. */

#define WS_RAND_MULT 1103515245
#define WS_RAND_ADD 12345
#define WS_RAND_MASK 0x7FFFFFFF

enum { WS_RAND_A = (WS_RAND_MULT & WS_RAND_MASK),
       WS_RAND_B = (WS_RAND_ADD & WS_RAND_MASK) };

PRIVATE int ws_random(void)
{
  tls(ws_rand_last) = tls(ws_randx);
  tls(ws_randx) = ((WS_RAND_A * tls(ws_randx) + WS_RAND_B) & WS_RAND_MASK);
  return tls(ws_rand_last);
}

PRIVATE void init_ws_random(int seed)
{
  tls(ws_randx) = (seed & WS_RAND_MASK);
  ws_random();
}

#define NEVER_CANCEL_REQUEST -1
#define VERBOSE_WS 0

PRIVATE st_context_t try_request_task(workers_list_t self, 
				      worker_group_t wg
#if VERBOSE_WS
				      , int verbose
#endif
				      )
{
  struct worker_msg msg[1];
  struct task_steal_request volatile * req = &msg->m.tsr;
  workers_list_t start, base, p;
  int first_time = 1;

#if VERBOSE_WS
  if (verbose) {
    printf("worker %ld : request\n", tls(worker_id));
  }
#endif
  /* scan the worker list to find a busy processor */
  
  st_assert(!tls(in_handler));

#if ST_PROF
  st_prof_steal();
#endif /* ST_PROF */

  /* write the tag */
  msg->k = wmk_task_steal_request;
#if ST_DBG
  msg->sender_worker_id = tls(worker_id);
  msg->sender_thread_id = tls(thread_id);
#endif /* ST_DBG */

  {
    /* select base randomly */
    int r;
    int step;
    int nw = st_n_current_workers_1(wg);
    /* NOTE:
       st_n_current_workers_1(wg) never
       returns a value larger than the length of 
    st_current_workers_1(wg). so we never run out of the list */
    
    base = st_current_workers_1(wg);
    step = ws_random() % nw;
    
    st_assert(base);
    for (r = 0; r < step; r++) {
      base = base->next;
#if ST_DBG
      if (base == 0) {
	fprintf(st_errout, 
		"ran out current_workers list (n_current_workers = %d), "
		"but the list had only %d elements\n",
		nw, r);
	st_app_die(1);
      }
#endif
    }
  }

  p = base;
  start = base->next;		/* because we do p = p->next first */
  first_time = 1;

  while(1) {
    req->reply = 0;
    MEMBAR_WRITE_WRITE();
    p = p->next;
    /* true either when this is the very first iteration or 
       we have circulated the entire list and came back to the original
       position */
    if (p == start) {
      if (first_time) {
	first_time = 0;
      } else {
	/* came back to the original cell. give up */
#if VERBOSE_WS
	if (verbose) {
	  printf("worker %ld : give up\n", tls(worker_id));
	}
#endif
	return 0;
      }
    }

    if (p == 0) {
      /* we overran the end of list. read the head. */
      p = st_current_workers_1(wg);
      st_assert(p);
    }
    
    if (p == self) {
      /* skip self */
#if VERBOSE_WS
      if (verbose) {
	printf("worker %ld : skip self %p\n", tls(worker_id), p);
      }
#endif
      continue;
    } else {
      if (ST_INT_LOC_LOCKED(&p->lock)) {
#if VERBOSE_WS
	if (verbose) {
	  printf("worker %ld : skip locked %p\n", tls(worker_id), p);
	}
#endif
	/* somebody is already requesting. try another worker */
	continue;
      } else {
	/* wait_limit : how many iterations to wait until cancel 
	   warn_limit : when failed to cancel the request, how many
	iterations to wait until printing a warning msg. */
	int wait_limit, warn_limit;
	int request_canceled;
	int got_lock;

	int _;
	int x = st_try_read_and_lock_int(&p->lock, &_);
	if (x == -1) {
	  got_lock = 0;
	} else {
	  st_assert(x == 0);
	  got_lock = 1;
	}

	if (!got_lock) {
	  /* I competed with somebody and lost. try another worker */
#if VERBOSE_WS
	  if (verbose) {
	    printf("worker %ld : skip locked (competed) %p\n", 
		   tls(worker_id), p);
	  }
#endif
	  continue;
	}
	/* we got a lock. now write request and wait for the reply 
	   from the victim */
	
	/* remember req is allocated in this frame.
	   once a processor requests a task from another processor,
	   either request is canceled by this processor, or picked up
	   by the victim. once the victim picks up the request, 
	   the processor someday replies exactly one reply for this
	   request. this procedure must not exit until it receives 
	the reply. */
	
	/* send request: (discard `volatile') */
	p->msg = (worker_msg_t)msg;
	/* wait for reply */
	wait_limit = tss(global_options)->steal_wait_limit;
	warn_limit = wait_limit + tss(global_options)->steal_wait_warn_limit;
	request_canceled = 0;
	while(1) {
	  if (req->reply) {
	    if (wait_limit < 0) {
	      /* wait limit has passed. we are here because we couldn't
		 cancel request. */
#if VERBOSE_WS
	      fprintf(st_errout, "worker %ld : OK, got a reply from %ld\n", 
		      tls(worker_id), p->worker_id);
#endif
	    }
	    break;
	  }
	  st_assert(tls(n_total_threads) == 0);
	  /* say I have no tasks to somebody who is requested me */
	  st_respond_to_worker_msg_sys(0);
#if ST_PROF
	  st_prof_steal();
#endif /* ST_PROF */

	  if (wait_limit == 0) {
	    /* try to cancel the request. this may compete with the victim's
	       attempt to pick up the request */
#if HAVE_LL_SC || HAVE_CMP_AND_SWAP
	    st_assert(msg);
	    {
	      long ok = CMP_AND_SWAP_LONG((long*)&p->msg, (long)msg, 0);
	      if (ok) request_canceled = 1;
	    }
#elif HAVE_SWAP
	    {
	      long x = SWAP_LONG((long*)&p->msg, 0);
	      if (x != 0) {
		st_assert(x == (long)msg);
		request_canceled = 1;
	      }
	    }
#else  /* HAVE_LL_SC || HAVE_CMP_AND_SWAP || HAVE_SWAP */
	    
#error "what do you think should I do?"
	    
#endif /* HAVE_LL_SC || HAVE_CMP_AND_SWAP || HAVE_SWAP */

	    if (request_canceled) {
	      THR_STAT_INC(n_steal_timeouts);
	      break;
	    } else {
#if VERBOSE_WS
	      fprintf(st_errout, 
		      "worker %ld : couldn't cancel request to %ld. "
		      "still wait for reply\n", 
		      tls(worker_id), p->worker_id);
#endif
	    }
	  } /* if(wait_limit == 0) */
	  

	  /* time_limit should have been passed, we failed to remove 
	     request there (meaning the victim picked up the request), 
	  but we do not still receive reply. something will be wrong. */
	  if (warn_limit == 0) {
	    fprintf(st_errout, 
		    "worker %ld : warning: do not get reply after %d waits "
		    "(after the request has been picked up by %ld). "
		    "something is probably wrong (may be the victim died "
		    "during giving me a task). we still keep going\n",
		    tls(worker_id), - wait_limit, p->thread_id);
	  }

	  /* wait_limit == NEVER_CANCEL_REQUEST
	     means we never cancel a request */
	  if (wait_limit != NEVER_CANCEL_REQUEST) {
	    wait_limit--;
	    warn_limit--;
	  }
	} /* while(req->reply == 0) */

	if (request_canceled) {
	  st_write_and_unlock_int(&p->lock, 0);
#if VERBOSE_WS
	  if (verbose) {
	    printf("worker %ld : canceled %p\n", tls(worker_id), p);
	  }
#endif
	  continue;
	} else if (req->reply == I_HAVE_NO_TASK) {
	  /* this processor does not have a task.
	     unlock and try another worker */
	  THR_STAT_INC(n_steal_requests_to_idle);
	  st_write_and_unlock_int(&p->lock, 0);
#if VERBOSE_WS
	  if (verbose) {
	    printf("worker %ld : no task %p\n", tls(worker_id), p);
	  }
#endif
	  continue;
	} else {
	  /* we got a task */
	  st_context_t rep = req->reply;
	  st_write_and_unlock_int(&p->lock, 0);
#if VERBOSE_WS
	  if (verbose) {
	    printf("worker %ld : got a task %p\n", tls(worker_id), p);
	  }
#endif
	  return rep;
	} /* if */
      }	/* if (l == ST_LOCKED) */
    } /* if (p == self) */
  } /* while(1) */
  /* NEVER REACH */
}

#if 0
PUBLIC void remove_from_workers(workers_list_t cell, work_group_t wg)
{
  /* i dont think this is possible.
     consider interactions between task stealers */
}
#endif

/* make the world SPMD (urawaza) */



/* initialize tls related to worker */
PRIVATE void init_worker(worker_group_t wg, long wid)
{
  add_to_workers(tls(worker_cell), wg, wid);
  /* initialization of work stealing data structure */
  tls(n_resumed_threads) = 0;
  tls(n_total_threads) = 0;
  tls(you_should_be_stolen) = 0;
  tls(thread_blocked) = 0;
  tls(task_steal_context) = 0;
  tls(task_steal_request) = 0;

  tls(n_child_workers) = 0;
  tls(max_child_workers) = 0;
  tls(child_workers) = 0;

  /* initialize statistics */
  tls(thread_stat).n_forks = 0;
  tls(thread_stat).n_blocks = 0;
  tls(thread_stat).n_steals = 0;
  tls(thread_stat).n_steal_timeouts = 0;
  tls(thread_stat).n_steal_requests_to_idle = 0;

  /* profiling */
#if ST_PROF
  tls(profile) = 0;
#endif

  /* initialize random generator state */
  init_ws_random(tls(worker_id));
}

/* a table that contains worker startup hooks. generated by stlink. */

extern thread_hook_t worker_start_hooks[];

PRIVATE void run_worker_start_hooks(void)
{
  thread_hook_t * hp = worker_start_hooks;
  thread_hook_t h = *hp;
  
  while (h){
    /* run hook */
    h();
    hp++;
    h = *hp;
  }
}

/* call f(a0, a1, a2, a3). when f finishes, write the result to
   wg->return_value and write 1 to wg->done. */
PRIVATE void master_worker_entry(worker_group_t wg, 
				 worker_proc_t f, 
				 void * a0, void * a1, void * a2, void *a3)
{
  void * r;
  DECLARE_STACK_INV_CHECK;

  SAVE_STACK_INV();
  r = f(a0, a1, a2, a3);
  CHECK_STACK_INV();

  /* write return value. tell other workers in this group that
     we can quit. */
  if (st_determine_wg_return_value(wg, r) == 0) {
    wg->entry_returned = 1;
  }
}

/* add a slave to the worker group which the calling thread belongs to.
   can be called only after the calling thread has performed INIT_WORKER. */
void st_add_slave_worker(void);

/* add the current OS-thread to the worker group WG.
   if f != 0, the current OS-thread is the master worker of WG, 
   and the parameter WID_OR_NW is the initial number of workers 
   to be created. otherwise it is a slave and WID_OR_NW is the worker
   id. the master spawns workers and starts f(a0, a1, a2, a3).
   after f returns, synchronize every OS-thread in this group.
   the master (f != 0) returns the return value of f. slaves return zero.
*/

PRIVATE void * 
worker_proc1(worker_group_t wg, long wid_or_nw, worker_proc_t f
#if (WORKER_PROC_N_PARAMS == 4)
	     , void * a0, void * a1, void * a2, void * a3
#else
#error change here accordingly when you change WORKER_PROC_N_PARAMS
#endif
	     )
{
  DECLARE_STACK_INV_CHECK;
  long wid = (f ? 0 : wid_or_nw);
  init_worker(wg, wid);

#if ST_PROF
  st_prof_setup_worker();
#endif /* ST_PROF */

  /* if I am the master, run the entry procedure */
  if (f) {
    int i;
    int nw = (int)wid_or_nw;
    st_assert(tls(worker_cell)->worker_id == 0);
#if ST_PROF
    st_prof_busy();
#endif /* ST_PROF */
    /* create slave workers */
    for (i = 1; i < nw; i++) {
      st_add_slave_worker();
    }
    
    /* start profiling */
    st_config_profile_1(wg, &wg->conf.prof_conf);
    if (wg->conf.prof_conf.do_profile) {
      st_begin_profile_1(wg);
    }

    /* run hooks */
    run_worker_start_hooks();
  
    /* we do not want to count this fork as a fork */
    THR_STAT_DEC(n_forks);
    SAVE_STACK_INV();
#if (WORKER_PROC_N_PARAMS == 4)
    /* invoke the entry procedure. this may return either because
       it is finished, is suspended, or exited by calling st_wg_exit. */
    PROC_FORK(master_worker_entry(wg, f, a0, a1, a2, a3));
#else  /* (WORKER_PROC_N_PARAMS == 4) */
#error change here accordingly when you change WORKER_PROC_N_PARAMS
#endif /* (WORKER_PROC_N_PARAMS == 4) */
    CHECK_STACK_INV();
  } else {
    st_assert(tls(worker_cell)->worker_id > 0);
#if ST_PROF
    st_prof_idle();
#endif /* ST_PROF */

    /* run hooks */
    run_worker_start_hooks();
  }

  {
    /* IDLE LOOP (not really idle. it really means no work on stack,
       but there may be some resumed contexts) */
    int n_idle_iterations = 0;	/* how many times did I loop in idle */
    /* wg->status maybe locked */
    while(!ST_INT_LOC_CHECK(&wg->status, wg_status_exited)) { 
      st_context_t t;
      
      /* we should have no threads on the C stack */
      st_assert(tls(n_total_threads) - tls(n_resumed_threads) == 0);
      
      /* we should have no invalid frames */
      st_assert(tls(fixed_invalid_frames) == 0);
      
      /* it is a good time to remove captured contexts */
      if (n_idle_iterations == 0) {
	st_remove_exported_frames_sys();
      }
      
      /* if I have a task on the stack of resumed threads, schedule it */
      if (tls(resumed_contexts_top)) {
	struct invalid_frame_desc iff[1];
#if ST_PROF
	st_prof_switch();
#endif /* ST_PROF */
	SAVE_STACK_INV();
	INVALID_PROC_FORK_X(st_schedule_resumed_context(tls(resumed_contexts_top), 
							iff), iff);
	CHECK_STACK_INV();
	continue;
      }
      
      st_assert(tls(n_total_threads) == 0);
      
      /* say I have no tasks in case somebody requested me */
      SAVE_STACK_INV();
      st_respond_to_worker_msg_sys(0);
      CHECK_STACK_INV();
      
      /* try to steal work from other stacks */
      SAVE_STACK_INV();
#if VERBOSE_WS
      /* when we want to see what is happening in a thief, set the
	 last argument to 1. */
      t = try_request_task(tls(worker_cell), wg, 0);
#else
      t = try_request_task(tls(worker_cell), wg);
#endif
      CHECK_STACK_INV();
      
      if (t) {
	/* we got a job from another stack */
	struct invalid_frame_desc iff[1];
	/* remove exported frames once again (there may be some
	   finished frames after this worker became idle) */
	SAVE_STACK_INV();
	st_remove_exported_frames_sys();
	CHECK_STACK_INV();
#if ST_PROF
	st_prof_switch();
#endif /* ST_PROF */
	SAVE_STACK_INV();
	INVALID_PROC_FORK_X(st_restart_context_n(t, iff, 1), iff);
	CHECK_STACK_INV();
	n_idle_iterations = 0;
      } else {
#if SHOW_IDLE
	if ((n_idle_iterations + 1) % 2048 == 0) {
	  fprintf(st_errout, "worker %d : idle\n", tls(worker_id));
	}
#endif

#ifdef WITH_SGC
	if (n_idle_iterations == 100) {
	  SAVE_STACK_INV();
	  st_remove_exported_frames_sys();
	  CHECK_STACK_INV();
#if VERBOSE_WS
	  fprintf(st_errout, 
		  "worker %ld : clear stack after a long idle period (stack size = %ld bytes)\n", 
		  tls(worker_id), st_stack_used_bytes());
#endif
	  SGC_CLEAR_STACK();
	}
#endif

	/* if a worker fails to steal a task from another thread
	   several times, it sleeps. this is important because this
	   thread should not try to lock message ports of other
	   threads intensively (otherwise, other threads are likely to
	   fail to send messages to them). */

	{
#if ST_PROF
	  st_prof_sleep();
#endif
	  st_sleep_os_thread_us(100); /* sleep 100 us */
#if ST_PROF
	  st_prof_steal();
#endif
	}

	n_idle_iterations++;
      }	/* if (t) */
    } /* while(wg->status != wg_status_exited) */
  }

  /* stop profiling */
  if (f) {
    if (wg->conf.prof_conf.do_profile) {
      st_end_profile_1(wg);
    }
    if (wg->conf.print_stat) print_worker_group_stat(wg);
  }

  /* finalize:
     somebody may still be requesting work from this stack.
     reject these requests */
  {
    int nw = st_n_current_workers_1(wg);
    int i_am_last;
    st_remove_exported_frames_sys();
#if ST_PROF
    st_prof_idle();
#endif /* ST_PROF */

    {
      /* say I have noticed we are done */
      int sq = st_fetch_and_add_int(&wg->n_quittings, 1);
      if (sq + 1 == nw) {
	i_am_last = 1;
      } else {
	i_am_last = 0;
      }
    }

    /* wait until everybody has noticed we are done */
    while (st_read_int(&wg->n_quittings) < nw) {
      st_assert(tls(n_total_threads) == 0);
      /* say I have no tasks when somebody requested me */
      st_respond_to_worker_msg_sys(0);
#if ST_PROF
      st_prof_idle();
#endif /* ST_PROF */
    }
    st_remove_exported_frames_sys();
#if ST_PROF
    st_prof_idle();
#endif /* ST_PROF */

    /* the master signals, if any, external waiter that we are done. 
       note that WG may be supplied by the external waiter.
       once it is signaled, the external waiter can reuse that storage.
       so we must not touch WG after it is signaled.
       in particular, slaves have no ideas about when the signaling occurs,
       so it cannot read wg->return_value here. */
    if (f) {
      /* tell if any OS-thread (which does not belong to this group)
	 that we are done. */
      void * r = wg->return_value;
      st_oneshot_sync_var_signal(wg->done_var);
      return r;
    } else {
      return 0;
    }
  }
}

/* an entry point from thr_proc_wrapper. 
   the master of a worker group is given an entry procedure of the
   group (F) as well as its arguments (Ai). 
   slaves are given a zero as F. they wait for the master to obtain
   the worker_id == 0 (to guarantee that the master's id is always zero). */
PRIVATE void * worker_proc(void * wg_, void * wid_or_nw_, void * f_
#if (WORKER_PROC_N_PARAMS == 4)
			   , void * a0, void * a1, void * a2, void * a3
#else
#error change here accordingly when you change WORKER_PROC_N_PARAMS
#endif
			   )
{
  /* cast arguments to meaningful types */
  worker_proc_t f = (worker_proc_t)f_;
  worker_group_t wg = (worker_group_t)wg_;
  long wid_or_nw = (long)wid_or_nw_;
  void * initial_sp = asm_get_sp();
  
  void * r = worker_proc1(wg, (long)wid_or_nw, f, a0, a1, a2, a3);

  /* everybody has noticed that the world has finished. 
     check if we did not do anything wrong */
  {
    int ok = 1;

    ST_FREE_STACK();

    if (tls(n_exported_frames) == 1) {
      void * final_sp = asm_get_sp();
      if (final_sp SP_LT initial_sp) {
	fprintf(st_errout, 
		"worker %ld : something wrong in stack management "
		"(initially SP = %p but eventually SP = %p)\n",
		tls(worker_id), initial_sp, final_sp);
	ok = 0;
      }
      if (final_sp SP_GT GROW_STACK(initial_sp, tss(max_sp_shrink_size))) {
	fprintf(st_errout, 
		"worker %ld : some frames were not deallocated "
		"(initially SP = %p but eventually SP = %p)\n", 
		tls(worker_id), initial_sp, final_sp);
	ok = 0;
      }
    } else {			/* no captured tasks on this stack */
      if (wg->entry_returned) {
	/* this means there are captured tasks that have not yet finished */
	fprintf(st_errout, 
		"worker %ld : some threads seem still blocked on exit "
		"(FP = %p, SP = %p)\n", 
		tls(worker_id), asm_get_fp(), asm_get_sp());
	/* st_show_exported_frames(); */
	ok = 0;
      } else {
#if 0
	if (tls(worker_id) == 0) {
	  fprintf(st_errout, "worker 0 : st_wg_exit was called\n");
	}
#endif
	ok = 1;
      }
    }
    if (!ok) {
      /* something wrong may have happend, but it is no longer a time 
	 to worry about it. quit anyway. */
    } else {
      /* here everything is OK. we may print something. */
    }
  }

  /* join with, if any, created child threads  */
  {
    int i;
    int n = tls(n_child_workers);
    for (i = 0; i < n; i++) {
      if (st_thr_join(tls(child_workers) + i) != 0) {
	fprintf(st_errout, "worker %ld : couldn't join a child\n", 
		tls(worker_id));
      } 
    }
  }

  return r;			/* WG->return_value (master) or 0 (slave) */
}

/* make the current thread the mater (worker_id == 0) of the worker group WG.
   spawn slave workers that share work derived from F(A0, A1, A2, A3, A4).*/
GLOBAL void * become_master_worker(worker_group_t wg, /* worker group */
				   int nw, /* initial number of workers */
				   worker_proc_t f, /* entry procedure */
#if (WORKER_PROC_N_PARAMS == 4)
				   void * a0, void * a1, void * a2, void * a3
#else
#error change here accordingly when you change WORKER_PROC_N_PARAMS
#endif
				   )
{
  /* the master also increments the worker_id_seed, but ignore the
     result. master's id is always 0 */
  void * r;
  long wid = st_fetch_and_add_long(wg->worker_id_seed, 1);
  st_assert(wid == 0);
  r = worker_proc((void *)wg, (void *)(long)nw, (void *)f, a0, a1, a2, a3);
  return r;
}

#define INITIAL_CHILD_WORKERS_SIZE 16
/* guarantee tls(child_workers) has at least N entries */
PRIVATE void ensure_child_workers(int n)
{
  int old_max = tls(max_child_workers);
  if (n > old_max) {
    st_thr_id_t old_workers = tls(child_workers);
    int new_max = (old_max > 0 ? old_max * 2 : INITIAL_CHILD_WORKERS_SIZE);
    /* allocate new buffer */
    st_thr_id_t new_workers 
      = (st_thr_id_t)st_malloc(sizeof(struct st_thr_id) * new_max);
    if (new_workers == 0) {
      /* buffer allocation failed */
      fprintf(st_errout, 
	      "worker %ld : could not allocate child_workers buffer of %d entries\n",
	      tls(worker_id), new_max);
      st_app_die(1);
    } else {
      bzero((void *)new_workers, sizeof(struct st_thr_id) * new_max);
      if (old_workers) {
	/* an old buffer already existed.
	   copy the old contents to the new one and free the old array. */
	bcopy((void *)old_workers, (void *)new_workers, 
	      sizeof(struct st_thr_id) * old_max);
	st_free((void *)old_workers);
      }
      /* set the new buffer */
      tls(max_child_workers) = new_max;
      tls(child_workers) = new_workers;
    }
  }
}

PUBLIC void st_add_slave_worker(void)
{
  st_thr_id tid;
  worker_group_t wg = tls(wg);
  long wid = st_fetch_and_add_long(wg->worker_id_seed, 1);
  int ncw;
  st_assert(wid > 0);

  st_thr_create(worker_proc, &tid,
		(void *)wg, (void *)wid, /* f = */ 0, 
		(void *)0, (void *)0, (void *)0, (void *)0);
  /* remember the thread ID (in the underlying thread package) of the 
     child worker, so that we can later join it */
  ncw = tls(n_child_workers);
  ensure_child_workers(ncw + 1);
  tls(child_workers)[ncw] = tid;
  tls(n_child_workers) = ncw + 1;
}

/* create a separate thread that becomes the master of worker group WG */
GLOBAL void create_master_worker(worker_group_t wg, /* worker group */
				 int nw, /* initial number of workers */
				 worker_proc_t f, /* entry procedure */
#if (WORKER_PROC_N_PARAMS == 4)
				 void * a0, void * a1, void * a2, void * a3
#else
#error change here accordingly when you change WORKER_PROC_N_PARAMS
#endif
				 )
{
  /* see the above comment on worker id */
  st_thr_id tid;
  long wid = st_fetch_and_add_long(wg->worker_id_seed, 1);
  st_assert(wid == 0);
  st_thr_create(worker_proc, &tid,
		(void *)wg, (void *)(long)nw, (void *)f, a0, a1, a2, a3);
}

/* become a group of workers that performs f(a0, a1, a2, a3). */

GLOBAL void * become_worker_group(worker_group_conf_t wgc, /* configuration */
				  int nw, /* initial number of workers */
				  worker_proc_t f, /* entry procedure */
#if (WORKER_PROC_N_PARAMS == 4)
				  void * a0, void * a1, 
				  void * a2, void * a3
#else
#error change here accordingly when you change WORKER_PROC_N_PARAMS
#endif
				     )
{
  void * r;
  struct worker_group wg[1];

  init_worker_group(wg, wgc);
  /* this thread becomes the master */
  r = become_master_worker(wg, nw, f, a0, a1, a2, a3);
  return r;
}

/* try to set the return value of worker group WG to R, if it has
   not been set */
GLOBAL int st_determine_wg_return_value(worker_group_t wg, void * r)
{
  wg_status_t s = (wg_status_t)st_read_and_lock_int(&wg->status);
  if (s != wg_status_running) {
    /* somebody has already determined the result value */
    st_assert(s == wg_status_exited);
    st_write_and_unlock_int(&wg->status, (int)s);
    fprintf(st_errout, 
	    "%ld st_wg_exit(%lu): somebody has called st_wg_exit or "
	    "the entry function returned\n",
	    tls(worker_id), (unsigned long)r);
    return -1;
  } else {
    /* lock is acquired. I am responsible for setting the result value
       of this group */
#if ST_DBG && 0
    fprintf(st_errout, "worker %d : return value = %d\n", tls(worker_id), r);
#endif
    wg->return_value = r;
    MEMBAR_WRITE_WRITE();
    st_write_and_unlock_int(&wg->status, (int)wg_status_exited);
    return 0;
  }
}

/* suspend all local threads in this stack and resume the toplevel
   idle loop (the resumption point is either PROC_FORK(master_entry(...)), 
   INVALID_PROC_FORK_X(st_schedule_resumed_context(...)), or
   INVALID_PROC_FORK_X(st_restart_context_n(t, iff, 1), iff). */
PRIVATE void kill_local_threads_1(worker_group_t wg)
{
  int n = tls(n_total_threads) - tls(n_resumed_threads);
  struct st_context c[1];
  st_assert(ST_INT_LOC_CHECK(&wg->status, wg_status_exited));
#if ST_DBG && 0
  fprintf(st_errout, "worker %d : kill %d threads\n", tls(worker_id), n);
#endif
  if (n > 0) {
    st_prof_exiting();
    c->valid = 0;
    st_suspend_thread_n(c, n);
  } else {
    st_assert(n == 0);
  }
}

PRIVATE void kill_local_threads()
{
  kill_local_threads_1(tls(wg));
}

typedef struct kill_local_threads_msg
{
  int reply;
} * kill_local_threads_msg_t;

PRIVATE void kill_local_threads_msg_handler(void * a)
{
  kill_local_threads_msg_t m = (kill_local_threads_msg_t)a;
#if ST_DBG && 0
  fprintf(st_errout, 
	  "worker %d : received kill local threads msg\n", tls(worker_id));
#endif
  m->reply = 1;

  /* this procedure is called from handler dispatch code.
     tls(in_handler) flag is unset after this procedure normally returns
     (see st_respond_to_worker_msg), BUT THIS PROCEDURE DOES NOT RETURN!!!
     we unset it here. I do not think of any better way to handle this */
  st_assert(tls(in_handler));
  tls(in_handler) = 0;

  kill_local_threads();
}

PRIVATE void send_kill_local_threads(workers_list_t w)
{
  struct kill_local_threads_msg m_[1];
  struct kill_local_threads_msg volatile * m 
    = (struct kill_local_threads_msg volatile *)m_;
  struct worker_msg wm[1];

  m->reply = 0;
  /* this send cannot fail, so we repeat sending until we succeed */
  while (st_send_worker_generic_msg(w, wm, kill_local_threads_msg_handler, 
				    (void*)m) != 0) {
#if VERBOSE_WS
    fprintf(st_errout, 
	    "%ld : warning : retry to send kill_local msg to %ld\n",
	    tls(worker_id), w->worker_id);
#endif
    st_respond_to_worker_msg_sys(0);
  }
  while (m->reply == 0) {
    st_respond_to_worker_msg_sys(0);
  }
}

PRIVATE void st_wg_exit_1(worker_group_t wg, void * r)
{
  int s = st_determine_wg_return_value(wg, r);
  if (s == 0) {
    /* I am responsible for killing all threads */
    workers_list_t w = st_current_workers_1(wg);
    workers_list_t p;
#if ST_DBG && 0
    fprintf(st_errout, 
	    "worker %d : st_wg_exit kill all threads\n", tls(worker_id));
#endif
    
    for (p = w; p; p = p->next) {
      if (p != tls(worker_cell)) {
#if ST_DBG && 0
	fprintf(st_errout, 
		"worker %d --> worker %d : kill local threads\n",
		tls(worker_id), p->worker_id);
#endif
	send_kill_local_threads(p);
      }
    }
  } else {
#if ST_DBG
    fprintf(st_errout, 
	    "%ld : st_wg_exit does not kill threads\n", tls(worker_id));
#endif
  }
  kill_local_threads_1(wg);
}

PUBLIC void st_wg_exit(void * r)
{
  st_wg_exit_1(tls(wg), r);
}

PUBLIC void st_wg_die(void * r)
{
  st_stack_trace();
  st_wg_exit(r);
}

/* create a group of workers that performs f(a0, a1, a2, a3),
   not waiting for it to complete.
   unlike almost all other procedures in StackThreads, this procedure
   can be called from foreign code. */

PUBLIC void 
stf_create_async_worker_group(worker_group_t wg, /* worker group */
			      worker_group_conf_t wgc, /* configuration */
			      int nw, /* number of initial workers */
			      worker_proc_t f, /* entry procedure */
#if (WORKER_PROC_N_PARAMS == 4)
			      void * a0, void * a1, 
			      void * a2, void * a3
#else
#error change here accordingly when you change WORKER_PROC_N_PARAMS
#endif
			      )
{
  ST_CALLBACK_BEGIN();

  init_worker_group(wg, wgc);
  create_master_worker(wg, nw, f, a0, a1, a2, a3);

  ST_CALLBACK_END();
}

/* wait for WG to exit. return the return value of the worker group */
PUBLIC void * stf_wait_for_wg_to_exit(worker_group_t wg)
{
  ST_CALLBACK_BEGIN();

  void * r;
  st_oneshot_sync_var_wait(wg->done_var);
  r = wg->return_value;

  ST_CALLBACK_END();

  return r;
}

/* 1 if WG has exited */
PUBLIC int stf_did_wg_exit(worker_group_t wg)
{
  ST_CALLBACK_BEGIN();

  int r = st_oneshot_sync_var_signaled(wg->done_var);

  ST_CALLBACK_END();

  return r;
}

PUBLIC void * 
stf_create_sync_worker_group(worker_group_conf_t wgc, /* configuration */
			     int nw, /* number of initial workers */
			     worker_proc_t f, /* entry procedure */
#if (WORKER_PROC_N_PARAMS == 4)
			     
			     void * a0, void * a1, 
			     void * a2, void * a3
#else
#error change here accordingly when you change WORKER_PROC_N_PARAMS
#endif
			     )
{
  ST_CALLBACK_BEGIN();
  void * r;

  struct worker_group wg[1];
  stf_create_async_worker_group(wg, wgc, nw, f, a0, a1, a2, a3);

  r = stf_wait_for_wg_to_exit(wg);  

  ST_CALLBACK_END();

  return r;
}
