/* 
 * st.c
 */

/* 
 * Copyright (c) 1999 by Kenjiro Taura, Akinori Yonezawa. All rights reserved.
 * Copyright (c) 1999 by Yoshihiro Oyama, Toshio Endo. All rights reserved.
 * Copyright (c) 1999 by Kunio Tabata. All rights reserved.
 * Copyright (c) 1999 by Mitsubishi Research Institute.  All rights reserved.
 * Copyright (c) 1999 by Information-technology Promotion Agency.  All rights reserved.
 *
 * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY EXPRESSED
 * OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
 *
 * Permission is hereby granted to use or copy this program
 * for any purpose,  provided the above notices are retained on all copies.
 * Permission to modify the code and to distribute modified code is granted,
 * provided the above notices are retained, and a notice that the code was
 * modified is included with the above copyright notice.
 */

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <strings.h>

#include <st.h>
#include "st_int.h"

/* allocate S bytes for the runtime. this is a thin wrapper around malloc;
   the result (0 on failure) is handed back unchanged, so callers check it. */
GLOBAL char * st_malloc(int s)
{
  return (char *)malloc(s);
}

/* give back storage obtained from st_malloc (thin wrapper around free). */
GLOBAL void st_free(char * p)
{
  free((void *)p);
}

/* return SP_DIFF between the current stack pointer and the stack bottom
   recorded in thread local storage. */
PUBLIC long st_stack_used_bytes(void)
{
  long used = SP_DIFF(asm_get_sp(), tls(stack_bottom));
  return used;
}

/* terminate the process with exit status CODE (no stack trace is printed;
   see st_app_die for the variant that prints one). */
PUBLIC void st_app_exit(int code)
{
  exit(code);
}

/* print a stack trace of the calling thread, then terminate the process
   with exit status CODE. */
PUBLIC void st_app_die(int code)
{
  st_stack_trace();
  exit(code);
}

/* the one instance of the storage shared by all threads.
   (presumably what the tss() accessor used in this file refers to;
   the macro itself is defined elsewhere -- confirm in st_int.h) */
struct thread_shared_storage __tss;
/* initialization state of __tss. it starts out as tss_init_state_uninited.
   the representation depends on TMP: an st_int_loc_t when TMP is set,
   a plain volatile long otherwise. */
#if TMP
st_int_loc_t __tss_init_state 
= ST_INT_LOC_INITIALIZER(tss_init_state_uninited);
#else
long volatile __tss_init_state = (long)tss_init_state_uninited;
#endif

/* reset the shared procedure information to the "nothing registered" state:
   no table, an empty address range (lo = ULONG_MAX, hi = 0) and neutral
   values for the running min/max stack size statistics. */
PRIVATE void init_procedure_information_table(void)
{
  /* the table itself and the address range it covers */
  tss(proc_info_table) = 0;
  tss(proc_info_hi) = 0;
  tss(proc_info_lo) = (uslong)ULONG_MAX;

  /* running extrema; the first registered procedure overrides them */
  tss(min_sp_relative_size) = ULONG_MAX;
  tss(max_sp_relative_size) = 0;
  tss(max_sp_shrink_size) = 0;
}

/* put procedure entry T in front of the chain OLD and return the new head.

   - when OLD is empty, T itself is returned (no allocation); this relies on
     T->next being zero and staying so.
   - when OLD already has an entry of the same name with the same
     [begin, real_epilogue], nothing is added and OLD is returned.
   - an entry of the same name with a different range is a fatal error.
   - otherwise a heap copy of T is linked in front of OLD. it is a fatal
     error if the range of T overlaps the range of any entry in OLD. */

PRIVATE st_proc_info_t cons_table(st_proc_info_t t, st_proc_info_t old)
{
  st_proc_info_t q;
  st_proc_info_t copy;

  if (old == 0) return t;

  /* look for an entry describing the same function */
  for (q = old; q != 0; q = q->next) {
    if (strcmp(t->name, q->name) != 0) continue;
    if (t->begin == q->begin && t->real_epilogue == q->real_epilogue) {
      /* already registered */
      return old;
    }
    fprintf(st_errout,
            "fatal error: different entries for the same function %s [%p,%p] and [%p,%p]\n",
            t->name, (void *)q->begin, (void *)q->real_epilogue,
            (void *)t->begin, (void *)t->real_epilogue);
    st_app_exit(1);
  }

  /* a genuinely new entry: copy it and link the copy in front */
  copy = (st_proc_info_t)st_malloc(sizeof(struct st_proc_info));
  if (copy == 0) {
    fprintf(st_errout, "cannot allocate proc table entry\n");
    st_app_exit(1);
  }
  bcopy((void*)t, (void*)copy, sizeof(struct st_proc_info));
  copy->next = old;

  /* the range [b, e] of T must not overlap the range [bx, ex] of any
     entry that is already on the chain. Q is left pointing to the first
     offending entry, or zero when there is none. */
  for (q = old; q != 0; q = q->next) {
    uslong b = t->begin;
    uslong e = t->real_epilogue;
    uslong bx = q->begin;
    uslong ex = q->real_epilogue;
    int disjoint;
    if (b < bx) {
      disjoint = (e < bx);      /* must look like [b, e] [bx, ex] */
    } else if (b > bx) {
      disjoint = (ex < b);      /* must look like [bx, ex] [b, e] */
    } else {
      disjoint = 0;             /* b == bx, always wrong */
    }
    if (!disjoint) break;
  }
  if (q != 0) {
    st_assert(strcmp(t->name, q->name));
    fprintf(st_errout, "fatal error: broken procedure table\n");
    fprintf(st_errout, "procedure %s [%p,%p] and %s [%p,%p] overlap\n",
            t->name, (void*)t->begin, (void*)t->real_epilogue,
            q->name, (void*)q->begin, (void*)q->real_epilogue);
    fprintf(st_errout, "this is a known bug of Sun Workshop 5.0 assembler\n");
    st_app_exit(1);
  }
  return copy;
}

/* add pieces of information about procedures.
   usually called just once at the initialization of the program,
   but in principle callable at anytime, so we can potentially 
   support dynamic loading.

   TABLES is a zero-terminated array of tables. each table is an array of
   struct st_proc_info whose end is indicated by an entry with begin == 0.

   the procedure table is an array of buckets, each covering
   (1 << PROC_INFO_TABLE_SHIFT) bytes of code starting at proc_info_lo.
   a bucket holds the chain of all procedures that intersect it.

   proc_info_lo is kept aligned to the bucket size. this way, when the
   covered range grows, every old bucket coincides with exactly one new
   bucket and old chains can simply be moved over.

   NOTE: this assumes that proc_info_lo/proc_info_hi/proc_info_table are
   only written here and in init_procedure_information_table. */

#define PROC_INFO_TABLE_SHIFT 4

/* this is not MT-safe yet */
PUBLIC void st_add_procedure_information_table(st_proc_info_t * tables)
{
  const uslong bucket_mask = (((uslong)1) << PROC_INFO_TABLE_SHIFT) - 1;
  uslong lo = tss(proc_info_lo);
  uslong hi = tss(proc_info_hi);
  uslong max_sp_rel = tss(max_sp_relative_size);
  uslong min_sp_rel = tss(min_sp_relative_size);
  long sp_shr = tss(max_sp_shrink_size);
  st_proc_info_t * table = tss(proc_info_table);
  int i;

  /* pass 1: determine the lowest/highest address and the stack size
     statistics over the old values and all given procedures */
  for (i = 0; tables[i]; i++) {
    st_proc_info_t t = tables[i];
    int s;
    for (s = 0; t[s].begin; s++) {
      if (lo > t[s].begin) lo = t[s].begin;
      if (hi < t[s].real_epilogue) hi = t[s].real_epilogue;
      if ((uslong)t[s].sp_relative_size > max_sp_rel) {
        max_sp_rel = t[s].sp_relative_size;
      }
      if ((uslong)t[s].sp_relative_size < min_sp_rel) {
        min_sp_rel = t[s].sp_relative_size;
      }
      if (t[s].max_sp_shrink_size > sp_shr) {
        sp_shr = t[s].max_sp_shrink_size;
      }
    }
  }

  st_assert(max_sp_rel >= min_sp_rel);
  tss(max_sp_relative_size) = STACK_ALIGN_RAISE(max_sp_rel);
  tss(min_sp_relative_size) = STACK_ALIGN_DROP(min_sp_rel);
  tss(max_sp_shrink_size) = sp_shr;

  /* no procedure is known at all (neither old nor new); there is nothing
     to put in a table */
  if (hi <= lo) return;

  /* keep the base of the table on a bucket boundary */
  lo &= ~bucket_mask;

  if (table == 0 || hi != tss(proc_info_hi) || lo != tss(proc_info_lo)) {
    /* the covered range has changed; build a table for the new range */
    uslong n_elems = ((hi - lo - 1) >> PROC_INFO_TABLE_SHIFT) + 1;
    st_proc_info_t * new_table;
    int size;
    if (n_elems > (uslong)INT_MAX / sizeof(st_proc_info_t)) {
      /* the byte size would not fit the int taken by st_malloc */
      fprintf(st_errout, "proc table too large (%lu entries)\n", n_elems);
      st_app_exit(1);
    }
    size = (int)(sizeof(st_proc_info_t) * n_elems);
    new_table = (st_proc_info_t *)st_malloc(size);
    if (new_table == 0) {
      fprintf(st_errout, "failed to allocate proc table (%d bytes)\n", size);
      st_app_exit(1);
    }
    /* clear table */
    bzero((void *)new_table, size);
    /* move ALL buckets of the old table to the corresponding position
       in the new table. the number of old buckets is computed in the same
       way as when the old table was allocated, so that the last, partially
       used, bucket is not lost. */
    if (table) {
      uslong old_lo = tss(proc_info_lo);
      uslong old_hi = tss(proc_info_hi);
      uslong old_n = ((old_hi - old_lo - 1) >> PROC_INFO_TABLE_SHIFT) + 1;
      uslong offset = (old_lo - lo) >> PROC_INFO_TABLE_SHIFT;
      bcopy((void *)table, (void *)(new_table + offset),
            old_n * sizeof(st_proc_info_t));
      st_free((char *)table);
    }
    table = new_table;
  }

  /* pass 2: register the given procedures. this is done even when the
     covered range did not change, because new procedures may lie inside
     the known range. cons_table ignores entries that are already there. */
  for (i = 0; tables[i]; i++) {
    st_proc_info_t t = tables[i];
    int s;
    for (s = 0; t[s].begin; s++) {
      uslong b = t[s].begin;
      uslong e = t[s].real_epilogue;
      uslong idx;
      /* all buckets that intersect [b, e) */
      for (idx = ((b - lo) >> PROC_INFO_TABLE_SHIFT);
           lo + (idx << PROC_INFO_TABLE_SHIFT) < e;
           idx++) {
        table[idx] = cons_table(t + s, table[idx]);
      }
    }
  }

  tss(proc_info_table) = table;
  tss(proc_info_lo) = lo;
  tss(proc_info_hi) = hi;
}

/* given a program counter PC, return the information of the procedure
   whose [begin, real_epilogue) contains PC.

   when no such procedure is registered (this happens when PC points to an
   address which is not seen by the assembly postprocessor):
   - zero_if_not_found != 0 : return zero,
   - zero_if_not_found == 0 : print a diagnostic and die.
   CHILD_INFO, when non-zero, is only used to name the callee in that
   diagnostic. */

PUBLIC st_proc_info_t st_get_proc_info_full(void * pc, int zero_if_not_found,
                                            st_proc_info_t child_info)
{
  uslong addr = (uslong)pc;
  int in_range;

  CHECK_TSS_INIT();

  in_range = (tss(proc_info_lo) <= addr && addr < tss(proc_info_hi));
  if (in_range) {
    /* scan the chain of the bucket that ADDR falls in */
    uslong bucket = (addr - tss(proc_info_lo)) >> PROC_INFO_TABLE_SHIFT;
    st_proc_info_t e;
    for (e = tss(proc_info_table)[bucket]; e; e = e->next) {
      if (e->begin <= addr && addr < e->real_epilogue) return e;
    }
  } else if (zero_if_not_found) {
    return 0;
  } else {
    fprintf(st_errout,
            "%ld : we are out of proc_info_table. pc = %p\n",
            tls(thread_id), pc);
    st_app_die(1);
  }

  /* not found */
  if (zero_if_not_found) return 0;

  if (child_info) {
    fprintf(st_errout,
            "%ld : proc_info_table entry not found for %p (parent of %s)\n",
            tls(thread_id), pc, child_info->name);
  } else {
    fprintf(st_errout,
            "%ld : proc_info_table entry not found for %p (no parent)\n",
            tls(thread_id), pc);
  }
  fprintf(st_errout, "%ld : st_get_proc_info failed\n", tls(thread_id));
  st_app_die(1);
  return 0;
}

/* return 1 if the call instruction at PC is a fork point of procedure PI,
   i.e., PC lies in [begin, end) of one of PI's fork point records.
   return 0 for a normal call, or when PI is zero.

   PC is an address of a call instruction (it may not be a return address) */

GLOBAL int st_is_fork_point(void * pc, st_proc_info_t pi)
{
  uslong addr;
  st_fork_point_info_t rec;
  int remaining;

  if (pi == 0) return 0;

  addr = (uslong)pc;
  rec = pi->fork_point_info;
  /* linear search over the n_fork_points records */
  for (remaining = pi->n_fork_points; remaining > 0; remaining--, rec++) {
    if (rec->begin <= addr && addr < rec->end) return 1;
  }
  return 0;
}

/* PC is an address where a call is made. return 1 if PC lies in
   [begin, end) of one of PI's invalid call site records; return 0
   otherwise, or when PI is zero. */
GLOBAL int st_is_invalid_call_site(void * pc, st_proc_info_t pi)
{
  uslong addr;
  st_invalid_call_site_info_t rec;
  int remaining;

  if (pi == 0) return 0;

  addr = (uslong)pc;
  rec = pi->invalid_call_site_info;
  /* linear search over the n_invalid_call_sites records */
  for (remaining = pi->n_invalid_call_sites; remaining > 0;
       remaining--, rec++) {
    if (rec->begin <= addr && addr < rec->end) return 1;
  }
  return 0;
}

/* reset the shared data information table to the empty state
   (no table, zero entries). */
PRIVATE void init_data_information_table()
{
  tss(n_data_info_table) = 0;
  tss(data_info_table) = 0;
}

/* verify that the N entries of A are in non-decreasing order of addr.
   print the first offending pair and exit otherwise. */
PRIVATE void check_data_info_table_sorted(st_data_info_t A, int n)
{
  int k;
  /* compare each entry with its predecessor */
  for (k = 1; k < n; k++) {
    if (A[k - 1].addr > A[k].addr) {
      fprintf(st_errout, "bomb a[%d] = %lu > a[%d] = %lu\n",
              k - 1, A[k - 1].addr, k, A[k].addr);
      st_app_exit(1);
    }
  }
}

/* sort A[a] ... A[c] (both ends inclusive) in ascending order of addr.
   ranges of less than five elements are sorted by selection sort;
   larger ranges by quicksort with a median-of-three pivot. */
PRIVATE void sort_data_info_table_aux(st_data_info_t A, int a, int c)
{
  if (c - a >= 4) {
    int mid = (a + c) / 2;
    struct st_data_info x = A[a], y = A[mid], z = A[c];
    struct st_data_info tmp;
    uslong pivot;
    int left, right;

    /* arrange A[a] <= A[mid] <= A[c] and take the median as the pivot */
    if (x.addr <= y.addr) {
      if (z.addr <= x.addr) {           /* z <= x <= y */
        A[a] = z; A[mid] = x; A[c] = y;
        pivot = x.addr;
      } else if (z.addr <= y.addr) {    /* x < z <= y */
        A[mid] = z; A[c] = y;
        pivot = z.addr;
      } else {                          /* x <= y < z, already in place */
        pivot = y.addr;
      }
    } else {
      if (z.addr <= y.addr) {           /* z <= y < x */
        A[a] = z; A[c] = x;
        pivot = y.addr;
      } else if (z.addr <= x.addr) {    /* y < z <= x */
        A[a] = y; A[mid] = z; A[c] = x;
        pivot = z.addr;
      } else {                          /* y < x < z */
        A[a] = y; A[mid] = x;
        pivot = x.addr;
      }
    }

    /* partition. neither scan runs out of the range, because
       A[a] <= pivot <= A[c] act as sentinels. */
    left = a + 1;
    right = c - 1;
    while (left <= right) {
      while (A[left].addr < pivot) left++;
      while (A[right].addr > pivot) right--;
      if (left > right) break;
      tmp = A[left];
      A[left] = A[right];
      A[right] = tmp;
      left++;
      right--;
    }
    /* here left >= right + 1, everything before LEFT is <= pivot and
       everything after RIGHT is >= pivot */
    sort_data_info_table_aux(A, a, left - 1);
    sort_data_info_table_aux(A, left, c);
  } else {
    /* selection sort: move the (first) minimum of A[i] .. A[c] to A[i] */
    int i;
    for (i = a; i <= c; i++) {
      struct st_data_info t;
      int m = i;
      int j;
      for (j = i + 1; j <= c; j++) {
        if (A[j].addr < A[m].addr) m = j;
      }
      t = A[i];
      A[i] = A[m];
      A[m] = t;
    }
  }
}

/* sort all N entries of A in ascending order of addr. */
PRIVATE void sort_data_info_table(st_data_info_t A, int n)
{
  int last = n - 1;
  sort_data_info_table_aux(A, 0, last);
}

/* this is not MT-safe.
   add the entries in TABLES to the global data information table
   (data_info_table). TABLES is a zero-terminated array of tables, each of
   which is an array of struct st_data_info terminated by an entry whose
   addr is zero (one table per .o file).

   the global table is replaced by a new, sorted table that holds both the
   old and the new entries. when TABLES holds no entry at all, the global
   table is left as it is. */

PUBLIC void st_add_data_information_table(st_data_info_t * tables)
{
  int old_n = tss(n_data_info_table);
  st_data_info_t old_table = tss(data_info_table);
  st_data_info_t new_table;
  int n_new_entries = 0;
  int new_n;                    /* number of entries in the new table */
  int new_size;                 /* size of the new table in bytes */
  int dst;                      /* destination index in the new table */
  int i, j;

  /* count the number of entries to add */
  for (i = 0; tables[i]; i++) {
    st_data_info_t t = tables[i];
    for (j = 0; t[j].addr; j++) {
      n_new_entries++;
    }
  }

  /* nothing to add. return here so that we never ask for a table of
     zero bytes (malloc may legitimately return zero for that, which is
     not an allocation failure) */
  if (n_new_entries == 0) return;

  /* the byte size must fit the int taken by st_malloc */
  if (n_new_entries
      > (int)(INT_MAX / sizeof(struct st_data_info)) - old_n) {
    fprintf(st_errout, "data table too large (%d + %d entries)\n",
            old_n, n_new_entries);
    st_app_exit(1);
  }

  /* allocate new table */
  new_n = old_n + n_new_entries;
  new_size = (int)(sizeof(struct st_data_info) * new_n);
  new_table = (st_data_info_t)st_malloc(new_size);
  if (new_table == 0) {
    fprintf(st_errout, "failed to allocate data table (%d bytes)\n", new_size);
    st_app_exit(1);
  }
  /* clear table */
  bzero((void *)new_table, new_size);

  /* copy the old contents into the new table */
  if (old_table) {
    bcopy((void *)old_table, (void *)new_table,
          sizeof(struct st_data_info) * old_n);
    st_free((char *)old_table);
  }

  /* append new data after the old entries */
  dst = old_n;
  for (i = 0; tables[i]; i++) {
    st_data_info_t t = tables[i];
    for (j = 0; t[j].addr; j++) {
      new_table[dst] = t[j];    /* struct copy */
      dst++;
    }
  }

  sort_data_info_table(new_table, new_n);
  check_data_info_table_sorted(new_table, new_n);
  tss(data_info_table) = new_table;
  tss(n_data_info_table) = new_n;
}

/* return (a pointer to) the entry of the data information table such that
   addr <= a < addr + size. return zero when there is no such entry: the
   table is empty, A is outside the range covered by the table, or A points
   to a gap between two entries.

   the table is assumed to be sorted by addr (st_add_data_information_table
   keeps it so). */
PUBLIC st_data_info_t st_get_data_info(void * _a)
{
  uslong a = (uslong)_a;
  st_data_info_t table = tss(data_info_table);
  long n = tss(n_data_info_table);
  uslong low, high;

  /* nothing registered; do not touch table[-1] */
  if (table == 0 || n <= 0) return 0;

  low = 0;
  high = (uslong)(n - 1);
  if (!(a < table[high].addr + table[high].size)) return 0;
  if (!(table[low].addr <= a)) return 0;

  while (high > low + 1) {
    /* low < m < high */
    uslong m = low + (high - low) / 2;
    /* invariant */
    st_assert(a < table[high].addr + table[high].size);
    st_assert(table[low].addr <= a);
    if (a < table[m].addr) {
      high = m;
    } else if (a >= table[m].addr + table[m].size) {
      low = m;
    } else {            /* table[m].addr <= a < table[m].addr + table[m].size */
      return table + m;
    }
  }

  /* at most two candidates are left. HIGH must be examined too: it is
     never examined inside the loop, so an address that belongs to the
     entry at HIGH (typically the last entry) would otherwise be missed */
  if (table[low].addr <= a && a < table[low].addr + table[low].size) {
    return table + low;
  }
  if (table[high].addr <= a && a < table[high].addr + table[high].size) {
    return table + high;
  }
  /* in this case, a points to a gap between LOW and the next */
  return 0;
}


/* guess the bottom of the stack by following the FP chain, starting from
   the frame of this procedure. the walk stops at the first frame whose
   return address is unknown to the procedure table; the frame visited just
   before that is returned. we die if no such frame is found within ten
   frames. */

PRIVATE void * guess_stack_bottom(void)
{
  void * frame = asm_get_fp();  /* frame being watched; initially ours */
  void * child = 0;             /* the frame watched one step earlier */
  void * ra = (void *)guess_stack_bottom; /* a pc inside the watched frame */
  int budget;                   /* how many more frames we may unwind
                                   (just in case) */
  /* information about the procedure of the watched frame */
  st_proc_info_t info = st_get_proc_info(ra, 0, 0);

  for (budget = 10; info && (budget > 0); budget--) {
    /* the offsets are -1 when the procedure does not save them at all.
       this should happen only for leaf functions, which should
       never appear in the middle of the stack */
    st_assert(info->return_address_offset != -1);
    st_assert(info->parent_fp_offset != -1);

    /* follow the link to the parent: load the return address and the
       parent's FP saved in the watched frame */
    child = frame;
    ra = *(void **)(child + info->return_address_offset);
    frame = *(void **)(child + info->parent_fp_offset);

    /* get information of the parent */
    info = st_get_proc_info(ra - JUMP_AND_LINK_DISPLACEMENT, 1, info);
  }

  if (info != 0) {
    /* we have followed the link of frames too far */
    fprintf(st_errout,
            "guess_stack_bottom could not find entry point of the stack. "
            "it seems that tls_init is called from a deeply nested point "
            "or stack has been corrupted\n");
    st_app_die(1);
    return 0;
  }

  /* the return address found in CHILD is not in the procedure table. that
     is, the parent of CHILD is unknown to us and CHILD is presumably the
     bottom-most frame which we know. */
  st_assert(child);
  return child;
}

/* one record of a stack trace: what follow_frame_chain has found out
   about a single frame. */

typedef struct stack_trace_info 
{
  /* position in the chain; 0 is the frame the walk started from */
  int depth;
  /* frame pointer of the frame, kept as an integer */
  uslong fp;			/* because we compare them */
  /* procedure information of the procedure the frame belongs to */
  st_proc_info_t pi;
  /* result of is_exported for this frame */
  int exported;
  int forked;			/* 1 if this is forked */
} * stack_trace_info_t;

/* print a piece of stack trace information as one line:

     <xx if exported> <-- or **> <depth> @ <fp as pointer> = <fp as integer>
     : <procedure name>

   "--" marks a frame whose FP is neither SP_LT the stack bottom of this
   thread nor SP_GEQ the current SP; "**" marks any other frame.
   a line of dashes follows when the frame was created by a fork. */

PRIVATE void print_stack_trace_info (stack_trace_info_t sti)
{
  uslong this_sp = (uslong)asm_get_sp();
  int my_frame_p = 1;
  if ((uslong)sti->fp SP_LT (uslong)tls(stack_bottom)) my_frame_p = 0;
  if ((uslong)sti->fp SP_GEQ this_sp) my_frame_p = 0;

  /* FP is shown both as a pointer and as an integer, like SP in the
     header printed by st_stack_trace (it used to be shown twice in the
     same format) */
  fprintf(st_errout, "%s %s%2d @ %p = %lu : %s\n",
          (sti->exported ? "xx" : "  "), (my_frame_p ? "-- " : "** "),
          sti->depth, (void *)sti->fp, sti->fp, sti->pi->name);

  if (sti->forked) {
    fprintf(st_errout, "-----\n");
  }
}

/* sort A[a] ... A[c] (both ends inclusive) by fp so that each element is
   SP_GEQ its successor (this is what check_stack_trace_info_sorted
   accepts). ranges of less than five elements are sorted by selection
   sort; larger ranges by quicksort with a median-of-three pivot.
   below, "larger" means "larger with respect to SP_GT". */
PRIVATE void sort_stack_trace_info(stack_trace_info_t A, int a, int c)
{
  if (c - a >= 4) {
    int mid = (a + c) / 2;
    struct stack_trace_info x = A[a], y = A[mid], z = A[c];
    struct stack_trace_info tmp;
    uslong pivot;
    int left, right;

    /* arrange A[a] SP_GEQ A[mid] SP_GEQ A[c] and take the median as
       the pivot */
    if (x.fp SP_GEQ y.fp) {
      if (z.fp SP_GEQ x.fp) {           /* order is z, x, y */
        A[a] = z; A[mid] = x; A[c] = y;
        pivot = x.fp;
      } else if (z.fp SP_GEQ y.fp) {    /* order is x, z, y */
        A[mid] = z; A[c] = y;
        pivot = z.fp;
      } else {                          /* order is x, y, z; in place */
        pivot = y.fp;
      }
    } else {
      if (z.fp SP_GEQ y.fp) {           /* order is z, y, x */
        A[a] = z; A[c] = x;
        pivot = y.fp;
      } else if (z.fp SP_GEQ x.fp) {    /* order is y, z, x */
        A[a] = y; A[mid] = z; A[c] = x;
        pivot = z.fp;
      } else {                          /* order is y, x, z */
        A[a] = y; A[mid] = x;
        pivot = x.fp;
      }
    }

    /* partition. neither scan runs out of the range, because A[a] and
       A[c] act as sentinels: A[c] is not larger and A[a] is not smaller
       than the pivot. */
    left = a + 1;
    right = c - 1;
    while (left <= right) {
      while (A[left].fp SP_GT pivot) left++;
      while (A[right].fp SP_LT pivot) right--;
      if (left > right) break;
      tmp = A[left];
      A[left] = A[right];
      A[right] = tmp;
      left++;
      right--;
    }
    /* here left >= right + 1, nothing before LEFT is smaller than the
       pivot and nothing after RIGHT is larger than the pivot */
    sort_stack_trace_info(A, a, left - 1);
    sort_stack_trace_info(A, left, c);
  } else {
    /* selection sort: move the (first) largest of A[i] .. A[c] to A[i] */
    int i;
    for (i = a; i <= c; i++) {
      struct stack_trace_info t;
      int m = i;
      int j;
      for (j = i + 1; j <= c; j++) {
        if (A[j].fp SP_GT A[m].fp) m = j;
      }
      t = A[i];
      A[i] = A[m];
      A[m] = t;
    }
  }
}

/* verify that no element of A[a] ... A[b] is SP_LT its successor.
   print the first offending pair and exit otherwise. */
PRIVATE void check_stack_trace_info_sorted(stack_trace_info_t A, int a, int b)
{
  int k;
  /* compare each element with its predecessor */
  for (k = a + 1; k <= b; k++) {
    if (A[k - 1].fp SP_LT A[k].fp) {
      fprintf(st_errout, "bomb a[%d] = %lu SP_LT a[%d] = %lu\n",
              k - 1, A[k - 1].fp, k, A[k].fp);
      st_app_exit(1);
    }
  }
}

#define LEFT_CHILD_IDX(k) (2 * (k) + 1)
#define RIGHT_CHILD_IDX(k) (2 * (k) + 2)

/* return 1 if FP is the fp of some record in the subtree rooted at H[K]
   of the exported frames heap H of N records. the whole subtree is
   searched (the heap order is not used to prune the search). */
PRIVATE int is_exported_aux(void * fp, export_frame_record_t h, int k, int n)
{
  int left = LEFT_CHILD_IDX(k);
  int right = RIGHT_CHILD_IDX(k);

  st_assert(k < n);
  if (h[k].fp == fp) return 1;
  /* left subtree first, then right subtree */
  if (left < n && is_exported_aux(fp, h, left, n)) return 1;
  if (right < n && is_exported_aux(fp, h, right, n)) return 1;
  return 0;
}

PRIVATE int is_exported(void * fp)
{
  return is_exported_aux(fp, tls(exported_frames), 0, tls(n_exported_frames));
}

/* starting from START_PC and START_FP, follow the chain of frames until:
   (1) we reach a frame unknown to us,
   (2) MAX_N_FRAMES frames have been examined, or,
   (3) N_THREADS fork points have been crossed.
   when MAX_N_FRAMES == 0, check for (2) is skipped.
   when N_THREADS == 0, check for (3) is skipped.

   one record per examined frame is written to TRACE_BUF, which must have
   room for all of them.
   when SORT is non-zero, the records are sorted by sort_stack_trace_info.
   when PRINT is non-zero, the records are printed.

   return -1 if we stopped with exactly MAX_N_FRAMES records while the next
   frame was still known to us (i.e., the trace is truncated). otherwise
   return the number of frames examined. */

PRIVATE int follow_frame_chain(void * start_pc, void * start_fp,
                               int n_threads, int max_n_frames,
                               int sort, int print,
                               stack_trace_info_t trace_buf)
{
  void * frame = start_fp;      /* frame pointer of the frame being watched */
  void * ra = start_pc;         /* program counter being watched */
  int threads_seen = 0;         /* # of threads encountered */
  int frames_seen = 0;          /* # of frames encountered */
  int truncated;
  /* information about the procedure of the watched frame */
  st_proc_info_t info = st_get_proc_info(ra, 1, 0);

  while (info) {
    stack_trace_info_t rec;
    if ((n_threads != 0) && (threads_seen >= n_threads)) break;
    if ((max_n_frames != 0) && (frames_seen >= max_n_frames)) break;

    /* something is wrong if asmpp considered it as a leaf */
    st_assert(info->return_address_offset != -1);
    st_assert(info->parent_fp_offset != -1);

    /* write information about current frame */
    rec = trace_buf + frames_seen;
    rec->depth = frames_seen;
    rec->fp = (uslong)frame;
    rec->pi = info;
    rec->exported = is_exported(frame);

    /* follow the link to the parent: load the return address and the
       parent's FP saved in the watched frame */
    {
      void ** rap = (void **)(frame + info->return_address_offset);
      void ** pfpp = (void **)(frame + info->parent_fp_offset);
      ra = *rap;
      frame = *pfpp;
    }

    /* get information of the parent, and see if the parent called
       this procedure by fork */
    info = st_get_proc_info(ra - JUMP_AND_LINK_DISPLACEMENT, 1, info);
    if (st_is_fork_point(ra - JUMP_AND_LINK_DISPLACEMENT, info)) {
      rec->forked = 1;
      threads_seen++;
    } else {
      rec->forked = 0;
    }
    frames_seen++;
  }

  /* we have followed the link of frames too far,
     yet did not find a frame which is unknown to us */
  truncated = (info && (max_n_frames != 0) && (frames_seen == max_n_frames));

  if (sort) {
    sort_stack_trace_info(trace_buf, 0, frames_seen - 1);
    check_stack_trace_info_sorted(trace_buf, 0, frames_seen - 1);
  }

  if (print) {
    int i;
    for (i = 0; i < frames_seen; i++) {
      print_stack_trace_info(trace_buf + i);
    }
    if (truncated) {
      fprintf(st_errout, "... (more frames)\n");
    }
  }

  return truncated ? -1 : frames_seen;
}

#define MAX_STACK_TRACE_DEPTH 50
/* print stack trace */
PUBLIC void st_stack_trace()
{
  void * sp = asm_get_sp();
  struct stack_trace_info buf[MAX_STACK_TRACE_DEPTH];
  fprintf(st_errout, "thread %ld stack trace: SP = %p = %lu\n", 
	  tls(thread_id), sp, (uslong)sp);
  follow_frame_chain((void *)st_stack_trace, asm_get_fp(),
		     0, MAX_STACK_TRACE_DEPTH, 0, 1, buf);
}

/* print the chain of frames captured in CTXT, starting from its fixed_pc
   and fixed_fp. at most ctxt->n_threads threads and at most
   MAX_STACK_TRACE_DEPTH frames are shown. */
PUBLIC void st_show_context(st_context_t ctxt)
{
  struct stack_trace_info records[MAX_STACK_TRACE_DEPTH];
  /* no sorting, print */
  follow_frame_chain(ctxt->fixed_pc, ctxt->fixed_fp,
                     ctxt->n_threads, MAX_STACK_TRACE_DEPTH,
                     0, 1, records);
}

/* procedures for keeping track of exported frames */

/* restore the heap property of the heap H of N records after H[K] has been
   replaced: sink H[K] while it is SP_LT one of its children, each time
   exchanging it with the child that is not SP_LT the other. */
PRIVATE void downheap(export_frame_record_t h, int k, int n)
{
  for (;;) {
    struct export_frame_record tmp;
    /* the child to compare with: the left one unless the right one
       exists and the left one is SP_LT it */
    int child = LEFT_CHILD_IDX(k);
    if (child >= n) return;     /* H[K] is a leaf */
    if ((child + 1 < n)
        && ((uslong)h[child].fp SP_LT (uslong)h[child + 1].fp)) {
      child = child + 1;
    }
    /* done if that child need not go up */
    if (!((uslong)h[k].fp SP_LT (uslong)h[child].fp)) return;
    /* exchange h[k] and h[child], and continue from there */
    tmp = h[k];
    h[k] = h[child];
    h[child] = tmp;
    k = child;
  }
}
  
#define PARENT_IDX(k) (((k) - 1) / 2)
/* restore the heap property of the heap H after a record has been put at
   H[K]: lift H[K] while its parent is SP_LT it.

   we stop explicitly at the root (K == 0). PARENT_IDX(0) is not a reliable
   "no parent" mark: (0 - 1) / 2 is 0 when division truncates toward zero,
   so testing the parent index against zero never fires. */
PRIVATE void upheap(export_frame_record_t h, int k)
{
  while (k > 0) {
    struct export_frame_record tmp;
    int p = PARENT_IDX(k);
    /* done if the parent may stay above */
    if (!((uslong)h[p].fp SP_LT (uslong)h[k].fp)) return;
    /* exchange h[p] and h[k], and continue from the parent */
    tmp = h[p];
    h[p] = h[k];
    h[k] = tmp;
    k = p;
  }
}
  
#define SENTINEL_RA_MAGIC ((void*)100)
#define INIT_N_EXPORT_FRAMES_SIZE 1024

/* replace the exported frames heap of this thread by a newly allocated
   one that has room for N records. the n_exported_frames records of the
   old heap, if there is one, are copied and the old heap is freed.
   max_n_exported_frames becomes N. exit if the allocation fails. */
PRIVATE void ensure_exported_frames_size(int n)
{
  export_frame_record_t prev = tls(exported_frames);
  int prev_count = tls(n_exported_frames);
  export_frame_record_t fresh
    = (export_frame_record_t)st_malloc(sizeof(struct export_frame_record) * n);

  if (fresh == 0) {
    fprintf(st_errout, "cannot allocate export frames heap of %d elements\n", n);
    st_app_exit(1);
  }
  if (prev != 0) {
    /* carry over the records in use and drop the old storage */
    bcopy((void *)prev, (void *)fresh,
          prev_count * sizeof(struct export_frame_record));
    st_free((void *)prev);
  }
  tls(max_n_exported_frames) = n;
  tls(exported_frames) = fresh;
}

/* expensive consistency check for exported frames: verify, for the
   subtree rooted at H[K] of the heap H of N records, that no record is
   SP_LT one of its children. print the offending pair and die otherwise. */
PRIVATE void check_heap_validity_aux(export_frame_record_t h, int k, int n)
{
  int first = LEFT_CHILD_IDX(k);        /* the right child is FIRST + 1 */
  int c;

  /* H[K] against its left child, then against its right child */
  for (c = first; c <= first + 1; c++) {
    if ((c < n) && ((uslong)h[k].fp SP_LT (uslong)h[c].fp)) {
#if ST_DBG
      char * k_name = h[k].name;
      char * c_name = h[c].name;
#else  /* ST_DBG */
      char * k_name = "???";
      char * c_name = "???";
#endif /* ST_DBG */
      fprintf(st_errout,
              "%ld : invalid export frame heap h[%d].fp == %p (%s) is"
              " nearer to the bottom than h[%d].fp == %p (%s)\n",
              tls(thread_id), k, h[k].fp, k_name, c, h[c].fp, c_name);
      st_app_die(1);
    }
  }
  /* then the subtrees of the children */
  for (c = first; c <= first + 1; c++) {
    if (c < n) check_heap_validity_aux(h, c, n);
  }
}

PRIVATE void check_heap_validity(void)
{
  check_heap_validity_aux(tls(exported_frames), 0, tls(n_exported_frames));
}

/* as a sentinel, we put a dummy exported frame that is nearer to the bottom
   than any possible frame */
PRIVATE void init_exported_frames(void)
{
  export_frame_record_t h;
  tls(exported_frames) = 0;
  tls(n_exported_frames) = 0;
  ensure_exported_frames_size(INIT_N_EXPORT_FRAMES_SIZE);

  h = tls(exported_frames);
  h[0].fp = SHRINK_STACK(tls(stack_bottom), 1);
  h[0].rap = tls(dummy_exported_frame_rap);
#if ST_DBG
  h[0].pfpp = tls(dummy_exported_frame_pfpp);
  h[0].name = "(dummy frame)";
#endif /* ST_DBG */
  /* *h[0].rap = REMOVED_RA. this is never removed */
  *h[0].rap = SENTINEL_RA_MAGIC;

  tls(n_exported_frames) = 1;
  tls(fixed_watermark) = h[0].fp;
#if ST_DBG > 1
  check_heap_validity();
#endif /* ST_DBG > 1 */
}

/* remove, from the top of the exported frames heap, all frames that are
   marked as removed (i.e., the word at their rap holds REMOVED_RA), and
   update n_exported_frames and fixed_watermark accordingly. nothing is
   done when the top frame is not marked.

   CALLER_FP is compared with the watermark as it was before the removal.
   when CALLER_FP SP_LEQ that old watermark, a new SP is computed and this
   procedure leaves through asm_set_fp_sp_and_jmp to its pure epilogue
   with that SP, instead of returning normally.

   IN_USER is used only when ST_PROF is set: st_prof_busy is called when
   it is non-zero.

   NOTE(review): this procedure manipulates its own FP/SP; statement order
   and the set of locals presumably matter. do not restructure lightly. */
USED_BY_MACRO void st_remove_exported_frames_aux(void * caller_fp, int in_user)
{
  export_frame_record_t h = tls(exported_frames);
  if (*h[0].rap == REMOVED_RA) {
    int n = tls(n_exported_frames);
    /* the watermark before any frame is removed */
    uslong old_wm = (uslong)tls(fixed_watermark);
    
#if ST_PROF
    st_prof_delete_frame();
#endif
    /* remove frames that are marked as removed */
    while(*h[0].rap == REMOVED_RA) {
      /* the sentinel is never marked, so at least two frames are here */
      st_assert(n > 1);
      /* MAGIC number that indicates the frame has been finished */
      /* move the last element to the first */
      h[0] = h[n - 1];
      /* validate heap */
      downheap(h, 0, n - 1);
#if ST_DBG
      bzero((void*)(h + n - 1), sizeof(struct export_frame_record));
#endif /* ST_DBG */
#if ST_DBG > 1
      check_heap_validity_aux(h, 0, n - 1);
#endif /* ST_DBG > 1 */
      n--;
    }
    /* if there is only one exported frame, this must be the sentinel,
       whose FP is SHRINK_STACK(tls(stack_bottom), 1) and
       whose ra is SENTINEL_RA_MAGIC.
       
       n == 1 ==>    h[0].fp == SHRINK_STACK(tls(stack_bottom), 1) 
       && *h[0].rap == SENTINEL_RA_MAGIC 
       that is,
       not(n == 1) ||    (h[0].fp == SHRINK_STACK(tls(stack_bottom), 1) 
    && *h[0].rap == SENTINEL_RA_MAGIC) */
    
    st_assert((n > 1) || ((h[0].fp == SHRINK_STACK(tls(stack_bottom), 1))
			  && *h[0].rap == SENTINEL_RA_MAGIC));
    
    /* publish the new heap size; the new top frame is the new watermark */
    tls(n_exported_frames) = n;
    tls(fixed_watermark) = h[0].fp;
#if ST_DBG > 1
    check_heap_validity();
#endif /* ST_DBG > 1 */
    
    /* check if we must shrink SP */
    {
      if ((uslong)caller_fp SP_LEQ old_wm) {
	/* information about this very procedure; its return_address_offset
	   and pure_epilogue are used below */
	st_proc_info_t pi = 
	  st_get_proc_info((void *)st_remove_exported_frames_aux, 0, 0);
	void * fp = asm_get_fp();
	void * new_sp;
	if ((uslong)caller_fp SP_GT (uslong)tls(fixed_watermark)) {
	  /* the caller frame becomes the top frame */
	  void * ra = *(void**)(fp + pi->return_address_offset);
	  st_proc_info_t caller_pi 
	    = st_get_proc_info(ra - JUMP_AND_LINK_DISPLACEMENT, 0, 0);
	  new_sp = GROW_STACK(caller_fp + caller_pi->base_sp_minus_fp, 
			      tss(max_sp_shrink_size));
	} else {
	  /* the new top of the exported frames determines the new SP.
	     it must not be the sentinel */
	  if (h[0].fp == SHRINK_STACK(tls(stack_bottom), 1)) {
	    fprintf(st_errout, "exported frames become empty\n");
	    st_app_die(1);
	  }
	  new_sp = GROW_STACK2(h[0].fp + h[0].base_sp_minus_fp, 
			       (tss(max_sp_relative_size) 
				- tss(min_sp_relative_size)), 
			       tss(max_sp_shrink_size));
	}
#if ST_PROF
	if (in_user) st_prof_busy();
#endif
	/* leave through the pure epilogue with the new SP */
	asm_set_fp_sp_and_jmp(fp, new_sp, (void *)pi->pure_epilogue);
      } else {
#if ST_PROF
	if (in_user) st_prof_busy();
#endif
      }
    }
  }
}

/* record the fact that a frame pointed to by FP is unwound, thus
   this frame will be executed by an unknown thread in future. 
   when this frame is finished, its return address, which is stored
   at RAP, will be overwritten by an invalid value, so we can tell
   if the frame has been finished or not by checking whether *RAP 
   still holds a valid value */

GLOBAL void st_export_frame(void * fp, void ** rap, long base_sp_minus_fp
#if ST_DBG
			    , void ** pfpp, char * name
#endif /* ST_DBG */
)
{
  export_frame_record_t h;
  int n;

  /* it is so obvious that the current FP is above the watermark.
     we assert this because otherwise we must shrink SP after 
     remove_exported_frames 
  */
  st_assert(asm_get_fp() SP_GEQ tls(fixed_watermark));
  st_remove_exported_frames_sys();
#if ST_PROF
  st_prof_busy();
#endif
  h = tls(exported_frames);
  n = tls(n_exported_frames);

  /* make sure this frame is in the local stack */
  st_assert((uslong)tls(stack_bottom) SP_LEQ (uslong)fp 
	    && (uslong)fp SP_LT (uslong)asm_get_sp());

  if (n >= tls(max_n_exported_frames)) {
    int m = tls(max_n_exported_frames) * 2;
    if (n >= m) m = n * 2;
    ensure_exported_frames_size(m);
    h = tls(exported_frames);
  }
  h[n].fp = fp;
  h[n].rap = rap;
  h[n].base_sp_minus_fp = base_sp_minus_fp;
#if ST_DBG
  h[n].pfpp = pfpp;
  h[n].name = name;
#endif /* ST_DBG */

  upheap(h, n);
  tls(n_exported_frames) = n + 1;
  tls(fixed_watermark) = h[0].fp;
#if ST_DBG > 1
  check_heap_validity();
#endif /* ST_DBG > 1 */
}

/* print the exported frames of the current worker to st_errout.
   a header line shows the thread id, the watermark, the number of
   records and the current FP/SP. then every record whose return 
   address slot does not hold REMOVED_RA is printed; when ST_DBG is
   set, the frame chain starting at that record is followed too. */
PUBLIC void st_show_exported_frames()
{
  export_frame_record_t h = tls(exported_frames);
  int n = tls(n_exported_frames);
  int i;
  fprintf(st_errout, 
	  "%ld : exported frames (watermark = %p, n_exported_frames = %d)"
	  " FP = %p, SP = %p\n", 
	  tls(thread_id), tls(fixed_watermark), n,
	  asm_get_fp(), asm_get_sp());

  for (i = 0; i < n; i++) {
    /* records whose return address has been replaced by REMOVED_RA 
       are not shown */
    if (*h[i].rap != REMOVED_RA) {
      fprintf(st_errout, "%ld : frame [%d] = %p (%s) RA = %p @ %p\n", 
	      tls(thread_id), i, h[i].fp, 
#if ST_DBG
	      h[i].name, 
#else  /* ST_DBG */
	      "???", 
#endif /* ST_DBG */
	      *h[i].rap, (void *)h[i].rap);
      
#if ST_DBG
      {
	struct stack_trace_info buf[MAX_STACK_TRACE_DEPTH];
	fprintf(st_errout, "%ld : follow chain\n", tls(thread_id));
	follow_frame_chain(*h[i].rap, *h[i].pfpp,
			   0, MAX_STACK_TRACE_DEPTH, 0, 1, buf);
      }
#else
      /* the thread id is printed with %ld like in the other messages
	 of this function (it used to be %d here) */
      fprintf(st_errout, "%ld : cannot follow chain (ST_DBG == 0)\n", 
	      tls(thread_id));
#endif
    }
  }
}

PUBLIC int st_n_live_exported_frames()
{
  export_frame_record_t h = tls(exported_frames);
  int n = tls(n_exported_frames);
  int c = 0;
  int i;
  for (i = 0; i < n; i++) {
    if (*h[i].rap != REMOVED_RA) {
      c++;
    }
  }
  return c;
}

/* initialize thread local storage (TLS) of the calling thread.
   takes one parameter:
   tls_storage := free area for TLS. this must stay allocated as long as 
   the current thread is alive. the typical area is bottom frame of 
   this thread.
   the bottom of the current stack is not passed in; it is obtained 
   from guess_stack_bottom(). 
   the order of the statements matters: see the comments inside. */

GLOBAL void st_init_tls(thread_local_storage_t tls_storage)
{
  void * guessed_stack_bottom;
  /* install the thread local storage.
     and tentatively set tls(fixed_watermark) to somewhere below (closer to 
     the bottom) of any possible stack (when stack grows toward lower 
     address, it can be ULONG_MAX, otherwise zero). */
  __tls = tls_storage;
  tls(fixed_watermark) = BOTTOM_OF_THE_WORLD;

  /* register tls as a thread-specific data of the underlying 
     thread package */
  st_set_tls(tls_storage);

  /* clear everything (mainly for debugging purpose).
     this also wipes the watermark that was just set above. */
  bzero((void *)__tls, sizeof(struct thread_local_storage));

  /* since we cleared everything in __tls, we again
     set tls(fixed_watermark) to somewhere below (closer to 
     the bottom) of any possible stack (when stack grows toward lower 
     address, it can be ULONG_MAX, otherwise zero). 

     we must tentatively set __tls, tls(fixed_watermark), 
     and tls(fixed_max_sp_shrink_size) BEFORE any return
     from procedure, because epilogue code uses them 
     (tls(fixed_max_sp_shrink_size) is used only when a procedure
     blocks on i386, but placing it here is logical). */

  tls(fixed_watermark) = BOTTOM_OF_THE_WORLD;
  tls(fixed_max_sp_shrink_size) = tss(max_sp_shrink_size);
  guessed_stack_bottom = guess_stack_bottom();

  /* initially, no invalid frames */
  tls(fixed_invalid_frames) = 0;

  /* mark stack bottom */
  tls(stack_bottom) = guessed_stack_bottom;

  /* get a thread id: the value returned by fetch-and-add on the 
     shared seed tss(thread_id_seed) */
  tls(thread_id) = st_fetch_and_add_long(tss(thread_id_seed), 1);

  /* set up the table of exported frames of this thread */
  init_exported_frames();

  /* initialize pointer to global options */
  tls(gopts) = tss(global_options);

  /* I am not in a handler  */
  tls(in_handler) = 0;

}

/* tables generated by post processor and defined outside this file.
   st_init_tss registers them through 
   st_add_procedure_information_table and 
   st_add_data_information_table, respectively. */
extern st_proc_info_t st_global_proc_info_xxx_[]; 
extern st_data_info_t st_global_data_info_xxx_[]; 

/* make __tls point to tss(main_tls_storage), remembering the previous
   value of __tls in tss(save_invalid_tls_global_ctor) so that 
   __st_reset_tls_global_ctor can put it back.
   judging from the names, the pair brackets the execution of a global 
   constructor -- NOTE(review): the callers are not in this file, 
   confirm. */
GLOBAL void __st_set_tls_global_ctor()
{
  /* marker call; see st_dont_postprocess */
  st_dont_postprocess();
  /* save the current __tls before overwriting it */
  tss(save_invalid_tls_global_ctor) = __tls;
  __tls = tss(main_tls_storage);
  /* tentative watermark below any possible stack, as in st_init_tls */
  tls(fixed_watermark) = BOTTOM_OF_THE_WORLD;
  /* initialize tls(fixed_max_sp_shrink_size)? 
     for now we prohibit blocking anyway */
}

/* undo __st_set_tls_global_ctor: restore __tls from the value saved
   in tss(save_invalid_tls_global_ctor) */
GLOBAL void __st_reset_tls_global_ctor()
{
  /* marker call; see st_dont_postprocess */
  st_dont_postprocess();
  __tls = tss(save_invalid_tls_global_ctor);
}

/* convert a flag given as a string into an int.
   returns 1 for "1", 0 for "0", and -1 for any other string
   (only exact matches are accepted). */
GLOBAL int st_atoflag(char * arg)
{
  if (strcmp(arg, "1") == 0) return 1;
  if (strcmp(arg, "0") == 0) return 0;
  /* neither "0" nor "1" */
  return -1;
}

/* fill G with the compiled-in defaults (ST_GOPT_DEFAULT_xxx).
   this is done during st_init_tss. when a StackThreads application is
   launched from our main procedure (in ma.c), ma.c:parse_options may 
   overwrite these values afterwards; otherwise the defaults stay. 
   defaults converted by atol or st_atoflag are given as strings. */

PRIVATE void set_global_options_default(st_global_options_t g)
{
  /* workers and their stacks */
  g->n_workers  = atol(ST_GOPT_DEFAULT_N_WORKERS);
  g->stack_size = ST_GOPT_DEFAULT_STACK_SIZE;

  /* limits on waiting for a steal */
  g->steal_wait_limit      = atol(ST_GOPT_DEFAULT_STEAL_WAIT_LIMIT);
  g->steal_wait_warn_limit = atol(ST_GOPT_DEFAULT_STEAL_WAIT_WARN_LIMIT);

  /* statistics */
  g->print_toplevel_worker_stat =
    st_atoflag(ST_GOPT_DEFAULT_PRINT_TOPLEVEL_WORKER_STAT);

  /* time profile */
  g->time_profile             = st_atoflag(ST_GOPT_DEFAULT_TIME_PROFILE);
  g->time_profile_filename    = ST_GOPT_DEFAULT_TIME_PROFILE_FILENAME;
  g->time_profile_resolution  =
    atol(ST_GOPT_DEFAULT_TIME_PROFILE_RESOLUTION);
  g->time_profile_buffer_size =
    atol(ST_GOPT_DEFAULT_TIME_PROFILE_BUFFER_SIZE);

  /* stack unwinding (converted by atol, not by st_atoflag) */
  g->stack_unwind_optimization =
    atol(ST_GOPT_DEFAULT_STACK_UNWIND_OPTIMIZATION);
}

/* report that the runtime has been used before it was initialized,
   and terminate the process with exit status 1. does not return. */
USED_BY_MACRO void tss_init_error(void)
{
  fprintf(st_errout, 
	  "runtime error: StackThreads/MP has not been initialized yet.\n"
	  "\tYou are probably running global constructor before main.\n"
	  "\tThis is not supported yet, sorry.\n");
  /* we cannot stacktrace yet. st_app_die would call st_stack_trace 
     before exiting, so we leave through st_app_exit, which only 
     exits. */
  st_app_exit(1);
}


/* initialize the thread shared storage (__tss) if that has not been 
   done yet, and make sure that __tls of the caller is usable.

   __tss_init_state goes through 
     tss_init_state_uninited -> tss_init_state_initializing 
     -> tss_init_state_inited.
   - state is inited: nothing is initialized. __tls is fixed by 
     st_fix_tls and we return.
   - state is uninited: the caller becomes the initializer. it sets up
     __tss, marks the state inited and initializes its own TLS in 
     tss(main_tls_storage) with st_init_tls.
   - state is initializing: reported as an error and the application 
     dies.

   returns the value __tls had on entry (saved_tls), except on the 
   path that finds the state inited after taking the lock, which 
   returns the result of a second call to st_fix_tls. 
   NOTE(review): confirm that this difference is intended. */
USED_BY_MACRO void * st_init_tss()
{
  int i;
  /* remember __tls of the caller; it is overwritten temporarily below */
  thread_local_storage_t saved_tls = __tls;
  
  /* check if this is the first time */
  if (ST_INT_LOC_CHECK(&__tss_init_state, tss_init_state_inited)) {
    /* no. tls must have been registered */
    thread_local_storage_t old_tls = st_fix_tls();
    st_assert(old_tls == saved_tls);
    return saved_tls;
  } else {
    /* scratch TLS that lives only in this block */
    struct thread_local_storage tmp_storage[1];
    tss_init_state_t is;
    /* we don't know if __tls have been registered as a thread local value.
       we tentatively set __tls to a temporary storage to make a trivial
       function call (st_spin_lock) work. */
    __tls = tmp_storage;
    tls(fixed_watermark) = BOTTOM_OF_THE_WORLD;
    /* we don't yet know the right value of tss(fixed_max_sp_shrink_size),
       so we cannot initialize it here. fortunately, procedure calls during
       initialization never block, so they do not use it. */
    
    /* read the state and lock it. every branch below unlocks it by
       st_write_and_unlock_int */
    is = (tss_init_state_t)st_read_and_lock_int(&__tss_init_state);
    if (is == tss_init_state_uninited) {
      st_write_and_unlock_int(&__tss_init_state, 
			      tss_init_state_initializing); /* unlock */
      /* this is the first thread that calls st_init_tss.
	 I must be the main thread */
      __tls = tss(main_tls_storage);
      tls(fixed_watermark) = BOTTOM_OF_THE_WORLD;
      /* control leaves this block and continues with the 
	 initialization below */
    } else if (is == tss_init_state_inited) {
      /* restore the saved __tls */
      thread_local_storage_t old_tls;
      __tls = saved_tls;
      st_write_and_unlock_int(&__tss_init_state, is); /* unlock */
      old_tls = st_fix_tls();
      st_assert(old_tls == saved_tls);
      /* NOTE(review): st_fix_tls is called a second time here, whereas
	 the check at the top of this function returns saved_tls */
      return st_fix_tls();
    } else {
      /* This actually never happens? */
      __tls = saved_tls;
      st_write_and_unlock_int(&__tss_init_state, is); /* unlock */
      fprintf(st_errout, 
	      "error : TSS is being initialized by another thread\n");
      st_app_die(1);
    }
  }

  /* from here on we are the initializer */

  /* consistency checks (macros defined outside this file) */
  ASSERT_DATA_SIZES();
  ASSERT_ST_CONTEXT_OFFSETS();
  ASSERT_THREAD_LOCAL_STORAGE_OFFSETS();
  ASSERT_INVALID_FRAME_DESC_OFFSETS();

  /* clear __tss (just in case) and set tls(fixed_watermark) again to 
     make trivial function calls work */
  bzero((void *)&__tss, sizeof(struct thread_shared_storage));
  tls(fixed_watermark) = BOTTOM_OF_THE_WORLD;

  /* create key for get tls and save it in tss(tls_fix)
   */
  make_tls_key();

  /* tss(proc_info_table), tss(proc_info_lo), tss(proc_info_hi),
     tss(max_sp_relative_size), tss(min_sp_relative_size),
     tss(max_sp_shrink_size), */
  init_procedure_information_table();
  /* add stlink-generated table of procedure symbols */
  st_add_procedure_information_table(st_global_proc_info_xxx_);

  /* likewise for the table of data symbols */
  init_data_information_table();
  st_add_data_information_table(st_global_data_info_xxx_);

#if EXPLICIT_CONCURRENCY
  /* no unreflected concurrency */
  ST_LONG_LOC_INIT(&tss(unreflected_concurrency), 0);
  st_tl_init(tss(concurrency_lock));
#endif

  /* thread ID counter */
  ST_LONG_LOC_INIT(tss(thread_id_seed), 0);

  /* initialize the region in which worker_wrapper
     reads its arguments */
  for (i = 0; i < N_THR_PROC_WRAPPER_ARGS; i++) {
    ST_INT_LOC_INIT(&tss(thr_proc_wrapper_args)[i].state, 0);
  }

  /* tss(global_options) */
  set_global_options_default(tss(global_options));

  {
    /* say we have done initializing tss */
    tss_init_state_t 
      is = (tss_init_state_t)st_read_and_lock_int(&__tss_init_state);
    /* nobody else may have changed the state since we set it */
    st_assert(is == tss_init_state_initializing);
    /* unlock */
    st_write_and_unlock_int(&__tss_init_state, tss_init_state_inited);
  }

  /* initialize my own tls */
  st_init_tls(tss(main_tls_storage));
  /* return the old __tls */
#if 0
  return tss(main_tls_storage);
#else
  return saved_tls;
#endif
}

/* suspend N threads from the top of the stack 
   and make them schedulable via C.
   C must be invalid (c->valid == 0) on entry; the context is captured 
   into C by asm_capture_context and the stack is then unwound by 
   st_unwind(n, c). */
PUBLIC void st_suspend_thread_n(st_context_t c, int n)
{
  st_assert(c->valid == 0);

#if ST_PROF
  st_prof_switch();
#endif
  /* the body is executed only when asm_capture_context returns 0.
     NOTE(review): presumably that is the return right after capturing, 
     and a nonzero value is seen when the captured context is 
     restarted later (like setjmp) -- confirm. */
  if (asm_capture_context(c) == 0) {
    c->n_threads = n;		/* C represents N threads */
    tls(n_total_threads) -= n;	/* we lose N threads in this stack */
    tls(thread_blocked) = 1;	/* tells the fork point that the control
				   reaches there because of blocking 
				   (not because of thread finish) */
    st_unwind(n, c);
  } 
}

/* temporarily suspend the current thread and make it schedulable again
   (I believe this is useful only for debugging).
   the context C is a local variable of this procedure. */
PUBLIC void st_yield()
{
  struct st_context c[1];
  /* the treatment for c->valid field here is very tricky.
     normally, we assign c->valid = 0, suspend thread and capture
     context to C, and later it is resumed by resume_context or
     restart_context. resume_context or restart_context waits until
     c->valid becomes 1, so we fool resume_context by c->valid = 1 here,
     but actually C is still invalid. then we invalidate C and immediately 
     call suspend_thread_n, which assumes c->valid == 0. suspend_thread_n
     fills C with correct values. so there is a duration in which invalid
     context is in the queue of resumed_contexts, but it is OK as long as
     it will not be scheduled by any processor. this is ensured by not
     responding to task steal request meanwhile.
     */
  c->valid = 1;			/* pretend C is valid ... */
  c->n_threads = 1;		/* st_resume_context asserts this is 1 */
  st_resume_context(c);		/* ... so that this does not spin */
  c->valid = 0;			/* C is in fact not captured yet */
  st_suspend_thread_n(c, 1);	/* capture C and unwind */
}

/* make context C schedulable (it will be scheduled later).
   C is linked in at the bottom end of this worker's stack of resumed 
   contexts, then the thread counters of the worker are increased by 
   the number of threads C represents. */

PUBLIC void st_resume_context(st_context_t c)
{
  /* C is read through this pointer while we wait for it to 
     become valid */
  struct st_context volatile * vc = (struct st_context volatile *)c;
  st_context_t old_bottom = tls(resumed_contexts_bottom);

  /* C becomes the new bottom; its predecessor is the old bottom */
  c->q_next = 0;
  c->q_prev = old_bottom;
  if (old_bottom == 0) {
    /* the stack was empty, so C is also the top */
    tls(resumed_contexts_top) = c;
  } else {
    /* append C after the old bottom */
    old_bottom->q_next = c;
  }
  tls(resumed_contexts_bottom) = c;

  /* spin until C has been initialized. strictly speaking, this is 
     unnecessary because we only read c->n_threads and we know it 
     becomes 1 someday, but we play safe. */
  while (vc->valid == 0) {
    /* busy wait */
  }
  MEMBAR_READ_READ();

  /* account for the threads of C in this processor */
  {
    /* c->n_threads may not have been initialized yet. that is OK as 
       long as we do not schedule C for now */
    short n_added = c->n_threads;
    st_assert(n_added == 1);
    tls(n_total_threads) += n_added;
    tls(n_resumed_threads) += n_added;
  }
}

/* schedule context C: unlink it from the queue of resumed contexts, 
   adjust the thread counters and restart it by 
   st_restart_context_n(c, iff, 2). C must be valid.
   control does not come back from the restart. */
GLOBAL void st_schedule_resumed_context(st_context_t c, invalid_frame_desc_t iff)
{
  st_context_t before;		/* neighbor of C on the TOP side */
  st_context_t after;		/* neighbor of C on the BOTTOM side */
  short n_sched;

  st_assert(c->valid);

  /* take C out of the queue */
  before = c->q_prev;
  after = c->q_next;
  c->q_prev = 0;
  c->q_next = 0;

  /* q_next link:  TOP -> ... -> before -> c -> after -> ... -> BOTTOM */
  if (before) {
    before->q_next = after;
  } else {
    tls(resumed_contexts_top) = after;
  }

  /* q_prev link:  TOP <- ... <- before <- c <- after <- ... <- BOTTOM */
  if (after) {
    after->q_prev = before;
  } else {
    tls(resumed_contexts_bottom) = before;
  }

  /* adjust number of threads */
  n_sched = c->n_threads;
  st_assert(n_sched == 1);

  /* we lose one thread in the stack of resumed threads */
  tls(n_resumed_threads) -= n_sched;

  /* we lose one thread here and immediately gain one thread (by 
     PROC_FORK), so the net effect on tls(n_total_threads) is zero. */
  tls(n_total_threads) -= n_sched;

  /* discard this frame. control never comes back. */
  st_restart_context_n(c, iff, 2);
}

/* tentative alloca support.
   debugging hook: N and R are printed as "alloca(N) -> R" on stdout 
   for the first nine calls only. 
   returns R cast to char *. */
char * print_toriaezu_alloca_result(int n, void * r)
{
  /* change this to #if 0 when you do not want to see alloca info */
#if 1			
  /* number of reports printed so far. it stops at nine instead of
     counting every call, so it never overflows no matter how often 
     this procedure is called */
  static int n_reported = 0;
  if (n_reported < 9) {
    n_reported++;
    printf("alloca(%d) -> %p\n", n, r);
  }
#endif
  return (char *)r;
}

/* normally, calls to st_dont_postprocess are removed by the 
   postprocessor. however, when --no_postprocess is given in the command 
   line this does not happen and calls to st_dont_postprocess may remain 
   in the program. this definition lets such calls work: it does 
   nothing and always returns 0. */
int st_dont_postprocess()
{
  return 0;
}
