/*     ^Forth:  Traditional Benchmarks
         File:  gfbench.hf
      Deriver:  david.n.williams@umich.edu
      License:  Public Domain  (We're guessing that includes
                the original benchmark codes.)
Starting date:  March 16, 2001
Version 0.7.0:  March 24, 2001
Version 0.7.1:  August 29, 2001
Version 0.7.2:  October 26, 2002
Version 0.7.3:  November 6, 2002
Version 0.7.4:  November 17, 2002
Version 0.7.5:  November 22, 2002

This is a ^Forth version of the traditional benchmark suite used
in Gforth.  ^Forth is a dialect of Forth rearranged for
automatic translation to C source.  The current translator
produces code for a pfe loadable module.  The main optimizations
are:

  * Word call overhead in the pfe virtual machine, and function
    call overhead as well, are eliminated for words defined as
    ^Forth primitives, which emit in-line C source.

  * Nonprimitive ^Forth words used in ^Forth words can be
    treated by gcc as leaf functions.

  * ^Forth code that combines ^Forth primitives gets some extra
    benefit from gcc optimization.

  * When available, the loop parameters for the currently
    innermost loop are kept in registers.

  * When available, the top entries in the data and floating
    point stacks are kept in registers (TOS and FTOS).

The word CMATRIX-MAIN2 below perhaps best illustrates how ^Forth
allows Forth and C to be mixed in the same definition.  For
examples of ^Forth primitives, see the file kernel.hf.

The point here is to compare run times for Forth code compiled
with normal colon definitions (benchmarked in the usual way),
^Forth code automatically translated to C, and Forth code
translated to C by a human.
*/

BEGIN-EXTERNAL

{"
/* No extra include's. */
"}

REQUIRES hfkernel.hf

\ debug
ext-def: .s "dot_s"

END-EXTERNAL

MAKE-INDEX hfbench "^Forth Benchmarks Word Set"


/* *** siev.fs *** */

\ : SECS TIME&DATE  SWAP 60 * + SWAP 3600 * +  NIP NIP NIP ;

\ CREATE FLAGS 8190 ALLOT
8190 ccreate-allot: flags "flags"
8190 constant: #flags "sharp_flags"  \ for primes up to 16,384

\ FLAGS 8190 + CONSTANT EFLAG
variable: eflag "eflag"

def: primes "primes"  ( -- #primes ) 
  flags #flags one fill  zero three ( #p odd)
  eflag @ flags
  DO ( #p odd) i c@ ( flag)
    IF ( - prime) dup i + ( - i+prime) dup eflag @ <
      IF ( - prime i+prime) eflag @ swap
        DO ( - prime) zero i c! ( - prime) dup +LOOP
      ELSE ( - prime i+prime) drop
      THEN ( #p prime) swap 1+ swap
    THEN ( #p odd) two +
  LOOP ( #p odd) drop
;def

/* Printing version of the above, for debugging. */
def: .primes "dot_primes"  ( -- #primes ) 
  flags #flags + eflag !
  flags #flags one fill  zero three ( #p odd)
  eflag @ flags
  DO ( #p odd) i c@ ( flag)
    IF ( - prime) dup . dup i + ( - i+prime) dup eflag @ <
      IF ( - prime i+prime) eflag @ swap
        DO ( - prime) zero i c! ( - prime) dup +LOOP
      ELSE ( - prime i+prime) drop
      THEN ( #p prime) swap 1+ swap
    THEN ( #p odd) two +
  LOOP ( #p odd) drop
;def

/* This is intended to be a fairly direct translation
   of PRIMES above.
*/
def: cprimes "c_primes"  ( -- #primes )
c{
  int primes = 0, odd = 3;
  char *p = (char *) &flags, *q = (char *) eflag, *r; 
  memset (p, 1, sharp_flags);
  while (p < q){
    if (*p){ /* odd is prime */
/*  HF_PUSH (odd); }c . c{ */ /* print primes for debug */
      r = p + odd;
      if (r < q){
        while (r < q){
          *r = 0;  r += odd;}}
      primes += 1;}
    odd += 2; p++;}
  HF_PUSH (primes);
}c ;def

/* Printing version of the above, for debugging. */
def: .cprimes "dot_c_primes"  ( -- #primes )
  flags #flags + eflag !
c{ {
  int primes = 0, odd = 3;
  char *p = (char *) &flags, *q = (char *) eflag, *r; 

  memset (p, 1, sharp_flags);
  while (p < q){
    if (*p){  /* odd is prime */
    HF_PUSH (odd); }c . c{  /* print primes for debug */
      r = p + odd;
      if (r < q){
        while (r < q){
          *r = 0;  r += odd;}}
      primes += 1;}
    odd += 2; p++;}
  HF_PUSH (primes);
} }c ;def

1000 constant: #iters "sharp_iters"
\ 1 constant: #iters "sharp_iters"  \ debug
def: benchmark "benchmark"
  zero #iters zero DO primes nip LOOP ;def

def: cbenchmark "c_benchmark"
  zero #iters zero DO cprimes nip LOOP ;def

\ SECS BENCHMARK . SECS SWAP - CR . .( secs)
def: sieve-main "sieve_main"
  flags #flags + eflag !
  benchmark ( . ) drop
;def

def: csieve-main "c_sieve_main"
  flags #flags + eflag !
  cbenchmark ( . ) drop
;def


/* *** bubble.fs *** */

\ .( Loading Bubble Sort benchmark...) cr

\ A classical benchmark of an O(n**2) algorithm; Bubble sort
\
\ Part of the programs gathered by John Hennessy for the MIPS
\ RISC project at Stanford. Translated to forth by Marty Fraeman
\ Johns Hopkins University/Applied Physics Laboratory.

\ MM forth2c doesn't have it !
\ : mybounds  over + swap ;
\ We use BOUNDS, ^Forth has it.  No measureable effect with pfe.

\ 1 cells Constant cell  \ present in ^Forth

variable: seed "seed"  ( -- addr)

\ : initiate-seed ( -- )  74755 seed ! ;
\ : random  ( -- n )  seed @ 1309 * 13849 + 65535 and dup seed ! ;

74755 constant: seed0 "seed_zero"
1309  constant: 1309 "thirteen_zero_nine"
13849 constant: 13849 "thirteen_eight_forty_nine"
65535 constant: 65535 "sixty_four_k_minus_one"

def: initiate-seed "initiate_seed"  ( -- )  seed0 seed ! ;def

def: random "random"  ( -- n )
  seed @ 1309 * 13849 + 65535 and dup seed ! ;def

6000 constant: elements "elements"
\ 10 constant: elements "elements"  \ debug

\ align create list elements cells allot
host[ 6000 cells ]host create-allot: list "list"

def: initiate-list "initiate_list"  ( -- )
  list elements cells + list DO random i ! cell +LOOP
;def

def: dump-list "dump_list"  ( -- )
  list elements cells + list DO i @ . cell +LOOP cr
;def

def: verify-list "verify_list"  ( -- )
  list elements 1- cells bounds DO
    i 2@ > abort" bubble-sort: not sorted "
  cell +LOOP
;def

def: bubble "bubble"  ( -- )
\ ." bubbling..." cr
\ one  \ (dnw) This was a bug which had no time effect.
  elements one DO
    list elements i - cells bounds DO
      i 2@ > if i 2@ swap i 2! then
    cell +LOOP 
  LOOP
;def

def: cbubble "c_bubble"  ( -- )
c{
  int i = 1;
  p4cell *p, *q, s0, s1;

  while (i < elements){
    p = (p4cell *) &list;
    q = p + elements - i;
    while (p < q){
      s0 = p[0];  s1 = p[1];
      if (s0 < s1){
        p[0] = s1; p[1] = s0;} 
      p++;}
    i++;}
}c ;def

def: bubble-sort "bubble_sort"  ( -- )
  initiate-seed
  initiate-list
  bubble
  verify-list
;def

def: cbubble-sort "c_bubble_sort"  ( -- )
  initiate-seed
  initiate-list
  cbubble
  verify-list
;def

def: bubble-with-flag "bubble_with_flag"  ( -- )
  one elements one DO
    -one list elements i - cells bounds DO
      i 2@ > if i 2@ swap i 2! drop zero then
    cell +LOOP 
    if leave then
  LOOP
;def
  
def: bubble-sort-with-flag "bubble_sort_with_flag"  ( -- )
  initiate-seed
  initiate-list
  bubble-with-flag
  verify-list
;def

def: bubble-main "bubble_main"  ( -- )
        bubble-sort
\       bubble-sort-with-flag
;def

def: cbubble-main "c_bubble_main"  ( -- )
        cbubble-sort
\       cbubble-sort-with-flag  \ not implemented
;def


/* *** matrix.fs *** */

\ .( Loading Matrix Multiplication benchmark...) cr
\ NOTE: This version needs 0.5MB data space

\ A classical benchmark of an O(n**3) algorithm; Matrix Multiplication
\
\ Part of the programs gathered by John Hennessy for the MIPS
\ RISC project at Stanford. Translated to forth by  Marty Fraeman,
\ Johns Hopkins University/Applied Physics Laboratory.

\ MM forth2c doesn't have it !
\ : mybounds  over + swap ;
\ We use BOUNDS, ^Forth has it.  No measureable effect with pfe.

/* Already defined above. */
\ variable seed
\ : initiate-seed ( -- )  74755 seed ! ;
\ : random  ( -- n )  seed @ 1309 * 13849 + 65535 and dup seed ! ;

host[  \ calculated constants in EXTENSIONS vocabulary
200 constant row-size
\ 10 constant row-size  \ debug
row-size cells constant row-byte-size

row-size row-size * constant mat-size
mat-size cells constant mat-byte-size
 ]host  \ definitions in TRANSLATE vocabulary

host[ row-size ]host      constant: row-size "row_size"
host[ row-byte-size ]host constant: row-byte-size "row_byte_size"
host[ mat-size ]host      constant: mat-size "mat_size"
host[ mat-byte-size ]host constant: mat-byte-size "mat_byte_size"

\ align create ima mat-byte-size allot
\ align create imb mat-byte-size allot
\ align create imr mat-byte-size allot
host[ mat-byte-size ]host create-allot: ima "ima"
host[ mat-byte-size ]host create-allot: imb "imb"
host[ mat-byte-size ]host create-allot: imr "imr"

120 constant: 120 "one_hundred_twenty"
 60 constant:  60 "sixty"

def: initiate-matrix "initiate_matrix"  ( m[row-size][row-size] -- )
  mat-byte-size bounds DO
    random ( r) dup 120 / 120 * - 60 -
    ( r-[r/120]*120-60) i !
  cell +LOOP
;def

def: cinitiate-matrix "c_initiate_matrix"  ( m[row-size][row-size] -- )
c{
  p4cell *q, *p = (p4cell *)TOS;
  p4ucell r = seed;
  HF_DROP1;

  q = p + mat_size;
  while (p < q){ 
    r = (r * 1309 + 13849) & 65535;
    *p++ = r - (r / 120) * 120 - 60;}
  seed = r;
}c ;def

def: innerproduct "innerproduct"  ( a[row][*] b[*][column] -- int)
  zero ( col row sum) row-size zero DO
    ( sum) >r over @ over @ ( col row colval rowval) * r> +
    ( sum) >r
    ( col row) swap cell+ swap row-byte-size +
    r> ( col row sum) 
  LOOP
  >r 2drop r> ( sum)
;def

def: cinnerproduct "c_innerproduct"  ( a[row][*] b[*][column] -- int)
c{
  p4cell *row = (p4cell *) TOS;
  p4cell *col = (p4cell *) SP[1]; 
  p4cell i = 0, sum = 0;
  SP++;
  
  while (i++ < row_size){
    sum += *row * *col++;
    row += row_size;}
  TOS = sum;
}c ;def

def: matrix-main "matrix_main"  ( -- )
  initiate-seed
  ima initiate-matrix
  imb initiate-matrix 
  imr ima mat-byte-size bounds DO
    imb row-byte-size bounds DO
      j ( left.row) i ( right.col) innerproduct over ! cell+ 
    cell +LOOP
  row-size cells +LOOP
  drop
;def

def: cmatrix-main1 "c_matrix_main_one"  ( -- )
  initiate-seed
  ima cinitiate-matrix
  imb cinitiate-matrix
  imr ima mat-byte-size bounds DO
    imb row-byte-size bounds DO
      j i cinnerproduct over ! cell+ 
    cell +LOOP
  row-size cells +LOOP
  drop
;def

/* This gives a run time not measureably different from the
   above in our system.
*/
def: cmatrix-main2 "c_matrix_main_two"  ( -- )
c{
  p4cell *row = ima, *col = imb, *result = imr;
  p4cell *row_lim = row + mat_size;
  p4cell *col_lim = col + row_size; 
}c  
  initiate-seed
  ima cinitiate-matrix
  imb cinitiate-matrix
c{
  while (row < row_lim){ /* rows in left matrix */
    col = imb;
    while (col < col_lim){  /* cols in right matrix */
      HF_FILL_TOS SP -= 2;
      (p4cell *)SP[1] = row;  (p4cell *) TOS = col++;
      }c cinnerproduct c{
      *result++ = TOS; SP++;}
    row += row_size;}
}c ;def

def: dump-a-matrix "dump_a_matrix"  ( -- )
  ima mat-byte-size bounds DO i @ . cell +LOOP ;def

def: dump-b-matrix "dump_b_matrix"  ( -- )
  imb mat-byte-size bounds DO i @ . cell +LOOP ;def

def: dump-result-matrix "dump_result_matrix"  ( -- )
  imr mat-byte-size bounds DO i @ . cell +LOOP ;def


/* *** fib.fs *** */

def: fib "fib"  ( n -- f[n+1] )
  dup two < IF
    drop one
  ELSE
    dup
    1- RECURSE
    swap two - RECURSE
    +
  THEN ;def

def: cfib "c_fib"  ( n -- f[n+1] )
c{
  int n = TOS, f;

  if (n < 2){
    TOS = 1;
  }else{
    TOS = n - 1;  hf_c_fib_ ();  f = TOS;
    TOS = n - 2;  hf_c_fib_ ();  TOS += f;}
}c ;def

34 constant: 34 "thirty_four"

def: fib-main "fib_main"  34 fib drop ;def
def: cfib-main "c_fib_main" 34 cfib drop ;def