/**
 *         Title:  Parsing ANS Forth strings and
 *                 the input stream
 *          File:  parsing-ext.c
 *        Author:  David N. Williams
 *       License:  LGPL
 *       Version:  1.0.0
 * Last revision:  October 26, 2010
 *
 * Some of this file is derived from Wil Baden's ToolBelt, which
 * we believe to be in the public domain.  We have borrowed
 * freely from the pfe implementation of the ToolBelt.  For the
 * sake of the LGPL, the rest is
 *
 * Copyright (C) 1996-2008, 2010 by David N. Williams
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later
 * version.
 *
 * This library is distributed in the hope that it will be
 * useful, but WITHOUT ANY WARRANTY; without even the implied
 * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
 * PURPOSE.  See the GNU Library General Public License for moref
 * details.
 *
 * You should have received a copy of the GNU Lesser General
 * Public License along with this library; if not, write to the
 * Free Software Foundation, 59 Temple Place, Suite 330, Boston,
 * MA 02111-1307 USA.
 *
 * If you take advantage of the option in the LGPL to put a
 * particular version of this library under the GPL, the author
 * would regard it as polite if you would put any direct
 * modifications under the LGPL as well, and include a copy of
 * this request near the beginning of the modified library
 * source.  A "direct modification" is one that enhances or
 * extends the library in line with its original concept, as
 * opposed to developing a distinct application or library which
 * might use it.
 */


/**********
  Preamble   
 **********/

/**
 * This is an external pfe module, not part of the pfe
 * distribution.
 *
 * This extension word set is for parsing ANS Forth strings, the
 * parse area of the input stream, and the input stream across
 * lines or blocks.  Some of it is based on ^Forth Motorola
 * 680x0 parsing words going back to before 1996.  Some is based
 * on Wil Baden's ToolBelt 2000 or 2002, with a few name
 * changes.
 *
 * See the file parsing.fs for an ANS Forth implementation, and
 * the file parsing-test.fs for Hayes tests.
 *
 * Since they are based on REFILL, words that act across lines
 * are expected to work across blocks as well, but have not been
 * tested with blocks.
 *
 * To be safe for all input sources, it's important that the
 * parse area be accessed only through p4_source() and TO_IN.
 * Unfortunately, the code in this file uses TIB  and NUMBER_TIB
 * as well.  That will be fixed in a later version.
 */

#define _P4_SOURCE 1
#include <pfe/pfe-base.h>
 
#include <string.h>  /* memcmp() */
#include <ctype.h>   /* toupper(), ispunct() */

/* We use the prefix "dw_" to avoid possible label conflicts
 * with pfe prefixes.
 */
#include "parsing-ext.h"

static char libid[] = "parsing-ext pfe module dnw-1.0.0";

/**
 * PARSING-LIB-S  ( -- ident.s )
 */
FCode (dw_parsing_lib_s)
{

  *--SP = (p4cell) libid;  *--SP = strlen( libid );
}


/*******************
  Table of Contents
 *******************/

/***
 * 0 Notation and Terminology
 * 1 ANS Forth Strings
 * 1.1 ToolBelt 2002
 * 1.2 ToolBelt 2000
 * 1.3 Other String Extensions
 *
 * 2 Input Stream
 * 2.1 Parse Area
 * 2.2 Across Lines or Blocks
 */


/****************************
  0 Notation and Terminology
 ****************************/

/**
 * c[n]:  The number of address units for n characters.  The
 *   result of ( n) CHARS.
 *
 * s, or .s suffix:  Short for the pair representation
 *   [addr len), an fstring.
 *
 * x.len:  The len part of x = (addr len).
 * x.addr: The addr part of x = (addr len).
 *
 * s = s', or "s is the same as s'":  If s is represented by
 *   (addr len) and s' by (addr' len'), then addr = addr' and
 *   len = len'.  That is, the two fstrings have the same
 *   representation.
 *
 * s' is a substring of s:  addr <= addr' <= addr+c[len] and
 *   addr'+c[len'] <= addr+c[len].
 *
 * s is equivalent to s':  The lengths of s and s' are the same.
 *   The addresses of the string bodies need not be the same,
 *   but if the lengths are nonzero, the bodies have the same
 *   sequence of characters.
 *
 * right empty string:  For s = (addr len), the right empty
 *   string is (addr+c[len] 0), just off its end.  According to
 *   the definition of a substring, the right empty string of s
 *   is a substring of s, even though there is no character of s
 *   at its address.  That includes the case where s is empty.
 *
 * left empty string:  For s = (addr len), the left empty string
 *   is (addr 0), at its beginning.  According to the definition
 *   of a substring, the left empty string is a substring of s,
 *   although there is no character of s at its address.  That
 *   includes the case where s is empty.
 *
 *   Certain specifications in the ToolBelt imply one of these
 *   when an empty string results.  Our specs could have been
 *   written to be independent of such implementation details,
 *   but we find it clearer and simpler to adopt the same
 *   practice.
 *
 * whitespace:  In this implementation, any of the ASCII
 *   characters 0x0 ... 0x20.
 *
 * word in s:  A substring of s satisfying all of the following:
 *   1. it is not empty;
 *   2. it contains no whitespace characters;
 *   3. it is preceded by a whitespace character or is at the
 *      beginning of s;
 *   4. it is followed by a whitespace character or is at the
 *      end of s.
 *
 * "<name>":  In stack comments, the next word in the parse
 *   area, or the next word in the input stream if the comment
 *   applies across lines.
 *
 * cbuf, #cbuf:  In stack comments cbuf is the address of a
 *   character aligned buffer.  The number of characters the
 *   buffer can hold is #cbuf.
 */


/*********************
  1 ANS Forth Strings
 *********************/

/**
 * ANS Forth strings are represented on the data stack by an
 * (addr len) pair, where len is the number of characters.
 */

#define ISWHITE( C )  ( (unsigned char) C <= ' ' ) 
#define WHITE_Q( C )  ( ISWHITE( C )  ? ~0 : 0 )
                     
/*******************
  1.1 ToolBelt 2002
 *******************/

/**
 * "ToolBelt 2002" refers to the 2003-02-25 revision.
 */

/**
 * STRING/  ( addr len i -- addr+c[len-i] i )
 *
 * In the "normal case", where i is nonnegative and len-i is
 * nonnegative, the output string is the last i characters of
 * the input string. <ansref>"string-slash"</ansref>
 */
FCode (dw_string_slash)
{
  p4cell i = SP[0], len = SP[1];

  SP += 1;
  SP[1] = (p4cell) ( (p4char *) SP[1] + len - i );
  SP[0] = i;
}

/**
 * END-C@  ( s -- char )
 *
 * Assume the input string is not empty.  Leave its last
 * character.
*/
FCode (dw_end_c_fetch)
{
  p4cell len = SP[0];
  p4char *addr = (p4char*) SP[1];

  *++SP = (p4char) *(addr + len - 1);
}

/**
 * SKIP  ( addr len char -- addr+c[i] len-i )
 *
 * Skip leading instances of char.  When char is BL, any leading
 * whitespace char is skipped.  <ansref>"skip"</ansref>
 *
 * NOTE:  Based on Guido Draheim's pfe implementation of SKIP
 * and BL-SKIP.
 */
FCode (dw_skip)
{
  p4char *p = (p4char *) SP[2];
  p4cell n = SP[1];
  p4char c = (p4char) *SP++;

  if ( c == ' ' )
  {  while ( n && ISWHITE (*p) ) {n--; p++;} }
  else
  { while ( n && *p == c ) {n--; p++;} }
  SP[1] = (p4cell) p;
  SP[0] = n;
}

/**
 * SCAN  ( str len char -- str+c[i] len-i )
 *
 * Scan the string for the first occurrence of char.  If found,
 * leave the rest of the string, starting at char.  If not found
 * and if the input string is empty, leave it unchanged;
 * otherwise leave zero length and the address just after the
 * end of the string.  When char is BL it is matched by any
 * whitespace.  <ansref>"scan"</ansref>
 * 
 * NOTE:  Based on Guido Draheim's pfe implementation of SCAN
 * and BL-SCAN.
 */
FCode (dw_scan)			
{				
  p4char *p = (p4char *) SP[2];
  p4cell n = SP[1];
  p4char c = (p4char) *SP++;

  if (c != ' ')
  { while ( n && *p != c ) { n--; p++; } } 
  else
  { while ( n && !ISWHITE (*p) ) { n--; p++; } }

  SP[0] = n; SP[1] = (p4cell) p;
}

/**
 * scan-back  ( addr len char -- addr len-i )	2002 BACK
 *
 * Scan the input string backwards for char, or whitespace if
 * char is BL.  The output string either ends with the first
 * backward occurrence of char, respectively, whitespace, or is
 * the left empty string.
 *
 * Note:  We prefer the name SCAN-BACK to the ToolBelt's BACK,
 * because with no qualification, BACK could just as well mean
 * SKIP-BACK.
 */
FCode (dw_scan_back)
{
  p4cell n = SP[1];
  p4char *p = (p4char *) SP[2] + n;
  p4char c = (p4char) *SP++;

  if ( c == ' ' )
  { while ( n && !ISWHITE (*--p) ) n--; }
  else
  { while ( n && *--p != c ) n--; }

  *SP = n;
}

/**
 * WHITE?     ( char -- flag )	2002 Is-White?
 */
FCode (dw_white_Q)
{
  *SP = WHITE_Q ( *SP );
}

/**
 * TRIM|  ( addr len -- addr len-i )	20002 TRIM
 *
 * Trim trailing whitespace from the input string.  If the input
 * is all whitespace, the result is the left empty string.
 *
 * Note:  The ToolBelt name is TRIM (the same in 2000 and 2002),
 * which it seems to us would more naturally mean to remove both
 * leading and trailing whitespace.
 */
FCode (dw_trim_bar)
{
   *--SP = ' ';  FX (dw_skip_back);
}

/*******************
  1.2 ToolBelt 2000
 *******************/

/**
 * /SPLIT  ( addr len addr+c[i] len-i -- addr+c[i] len-i addr i )
 *
 * Split the character string (addr len) at the place given by
 * (addr+c[i] len-i].  Wil Baden's "cut-split".  ToolBelt 2002
 * makes this a deprecated synonym for CHOP.
 * <ansref>"cut-split"</ansref>
 */
FCode (dw_cut_split)
{
  p4cell len = SP[2], lenp = SP[0]; 
  p4char *addr = (p4char*) SP[3], *addrp = (p4char*) SP[1];

  SP[0] = len - lenp;
  SP[1] = (p4cell) addr;
  SP[2] = lenp;
  SP[3] = (p4cell) addrp;
}

/**
 * S-STARTS  ( s pat.s  -- s flag )	2000 STARTS?
 *
 * Leave true if the string s starts with pat.s.  From Wil
 * Baden's tool belt.  We use the pfe implementation from
 * toolbelt-ext.c.  <ansref>"starts-question"</ansref>
 *
 * NOTE:  The flag is true if pat.s is empty or if both s and
 * pat.s are empty.
 */
FCode (dw_s_starts)
{
  p4cell n = *SP++;

  if ( SP[1] < n || memcmp ((char*) SP[2], (char*) SP[0], n) ) 
      *SP = 0;
  else *SP = ~0;
}

/**
 * S-STARTS-NC  ( s uc.pat.s  -- s flag )
 *
 * Leave true if the string s starts with uc.pat.s.  The pattern
 * string is assumed to be all upper case.  Characters at the
 * beginning of s are forced to upper case before comparison,
 * without changing s.  <ansref>"starts-n-c-question"</ansref>
 *
 * NOTE:  The flag is true if pat.s is empty or if both s and
 * pat.s are empty.
 */
FCode (dw_s_starts_nc)
{
  p4cell n = *SP++;
  char *p = (char *) SP[2], *q = (char *) *SP;

  if (SP[1] < n) { *SP = 0; return; }
  while (n)
  {
    if (*q != toupper (*p)) break;
    p++;  q++; n--;
  }

  *SP = n ? 0 : ~0;
}
 
/**
 * S-ENDS?  ( s pat.s -- s flag )	2000 ENDS?
 *
 * Leave true if the string s ends with pat.s.  From Wil Baden's
 * tool belt.  We use the pfe implementation from
 * toolbelt-ext.c. <ansref>"ends-question"</ansref>
 */
FCode (dw_s_ends)
{
  p4cell n = *SP++;

  if ( SP[1] < n ||
       memcmp ((char*) SP[2] + (SP[1] - n), (char*) SP[0], n) )
       *SP = 0;
  else *SP = ~0;
}

/**
 * S-ENDS-NC  ( s uc.pat.s  -- s flag )
 *
 * Leave true if the string s ends with uc.pat.s.  The pattern
 * string is assumed to be all upper case.  Characters at the
 * end of s are forced to upper case before comparison, without
 * changing s. <ansref>"ends-n-c-question"</ansref>
 *
 * NOTE:  The flag is true if pat.s is empty or if both s and
 * pat.s are empty.
 */
FCode (dw_s_ends_nc)
{
  p4cell n = *SP++;
  p4cell l = SP[1];
  char *p = (char *) SP[2] + l - n, *q = (char *) *SP;

  if (SP[1] < n) { *SP = 0; return; }

  while (n)
  {
    if (*q != toupper (*p)) break;
    p++;  q++; n--;
  }

  *SP = n ? 0 : ~0;
}

/*****************************
  1.3 Other String Extensions
 *****************************/

/**
 * All "{}" words treat BL as a literal character, not as
 * shorthand for white-space.
 */

/**
 * The following four words are implemented in this library as
 * aliases.  They assume that len and the output length are
 * nonnegative.
 *
 * : CUT-FIRST  ( addr len i -- addr+c[i] len-i ) /string ;
 * : CUT-LAST   ( addr len i -- addr len-i )      - ;
 * : KEEP-FIRST ( addr len i -- addr i )          nip ;
 * : KEEP-LAST  ( addr len i -- addr+c[len-i] i ) string/ ;
 */

/**
 * SKIP-BACK  ( addr len char -- addr len-i | empty.s )
 *
 * Trim trailing instances of char from the input string, or of
 * whitespace if char is BL.  TRIM from the ToolBelt 2002 is
 * essentially the same as BL SKIP-BACK, which is the same as
 * the ANS word -TRAILING, up to implementation dependence on
 * how -TRAILING treats white space.
 *
 * Note:  This implementation produces empty.s = (addr 0).
 */
FCode (dw_skip_back)
{
  p4cell n = SP[1];
  p4char *p = (p4char *) SP[2] + n;
  p4char c = (p4char) *SP++;

  if ( n ) p-- ;

  if ( c == ' ' )
  { while ( n && ISWHITE (*p) ) { n--; --p; } }
  else
  { while ( n && *p == c ) { n--; --p; } }

  *SP = n;
}

/**
 * |TRIM  ( s -- s' )
 *
 * Leave the fstring s' resulting from trimming leading
 * whitespace from s.  If s is all whitespace or empty, s' is
 * its right empty
 * string.
 */
FCode (dw_bar_trim)
{
  *--SP = ' ';  FX (dw_skip);
}

/**
 * |TRIM|  ( s -- s' )
 *
 * Leave the fstring s' resulting from trimming leading and
 * trailing whitespace from s.  If s is all whitespace or empty,
 * s' is its right empty string.
 */
FCode (dw_bar_trim_bar)
{
  *--SP = ' ';  FX (dw_skip);  *--SP = ' ';  FX (dw_skip_back);
}

/**
 * SEEK  ( addr len char -- addr+c[i] len-i found? )
 *
 * Look for char in the input string, or whitespace if char is
 * BL. If the character is found, the output string begins just
 * after its first occurrence, or is the right empty string, and
 * found? is true.  Else the output string is the right empty
 * string and found? is false.  <ansref>"seek"</ansref>
 *
 * : seek ( ... -- ... ) scan dup IF 1 /string true ELSE false THEN ;
 */
FCode (dw_seek)			
{				
  p4char *p = (p4char *) SP[2];
  p4cell n = SP[1], found;
  p4char c = (p4char) SP[0];

  if (c != ' ')
  { while ( n && *p != c ) { n--; p++; } } 
  else
  { while ( n && !ISWHITE (*p) ) { n--; p++; } }

  if (n) { n--; p++; found = ~0; } else { found = 0; }

  SP[0] = found; SP[1] = n; SP[2] = (p4cell) p;
}

/**
 * {}IN  ( char 'chars -- char|0 )
 *
 * If the character is in the null-terminated list of
 * characters, leave it.  Else leave 0.
 * <ansref>"set-in"</ansref>
 */
FCode (dw_set_in)
{
  p4char *p = (p4char *) *SP++, c = *SP, elem;
  int i = 0; 
 
  while ( (elem = *p++) && (elem != c) ) i++ ;
  *SP = elem;
}

/**
 * {}SCAN  ( s &chars -- s' )
 *
 * Leave the substring s' of s starting at the first occurrence
 * of one of the characters in the null-terminated string
 * starting at &chars.  If no such character is found and s is
 * not empty, s' is empty.  If s is empty, s' is s.
 *
 * If BL is in the set of delimiters, it is treated literally,
 * not as a shorthand for whitespace.
 *
 * In this implementation, when empty, s' is the right empty
 * string.
 */
FCode (dw_set_scan)
{
  p4char c, d, *dels = (p4char *) *SP++, *p = (p4char *) SP[1];
  p4cell j, n = SP[0];

  while ( n != 0 )
  {
    n--;
    c = *p++;
    j =  0;
    while ( (d = dels[j++]) != 0 )
    {
      if ( c == d ) { n++; p--; goto push_result; }
    }
  }

push_result:
  SP[0] = n ; SP[1] = (p4ucell) p;
}

/**
 * {}SEEK  ( s &chars -- s' char|0 )
 *
 * Scan s for the first occurrence of any of the characters in
 * the null-terminated string starting at &chars.  If the
 * character is found, leave it, with s' the part of s that
 * follows it, or the right empty string if it is found at the
 * end of s.  If s is empty, leave s and zero.  If s is nonempty
 * and the character is not found, leave zero and the right
 * empty string.
 */
FCode (dw_set_seek)
{
  p4char c, d, *dels = (p4char *) SP[0], *p = (p4char *) SP[2];
  p4cell n = SP[1], i = n, j;

  while ( i != 0 )
  {
    i--;
    c = *p++;
    j =  0;
    while ( (d = dels[j++]) != 0 )
    {
      if ( c == d ) { goto push_result; }
    }
  }

  d = 0;

push_result:
  SP[0] = d; SP[1] = i; SP[2] = (p4ucell) p;
}

/**
 * {}INDEX  ( char 'chars -- index|-1 )
 *
 * If char is in the null-terminated list of characters at
 * &chars, leave its index.  Else leave -1.  The first character
 * has index zero.  <ansref>"set-index"</ansref>
 */
FCode (dw_set_index)
{
  p4char *p = (p4char *) *SP++, c = SP[0], elem;
  p4cell i = 0; 
 
  while ( (elem = *p++) && (elem != c) ) i++ ;
  *SP = elem ?  i :  -1 ;
}

/**
 * {}STARTS  ( s 'chars -- s char|0 )
 *
 * If the string is not empty and its first character is in the
 * null-terminated list of characters, leave the character and
 * the original string.  Else leave zero and the original
 * string.  <ansref>"set-starts-question"</ansref>
 */
FCode (dw_set_starts)
{
  p4char c, d, *dels = (p4char *) SP[0];
  
  if (!SP[1]) { *SP = 0 ; return; }

  c = *(p4char *) SP[2];
  while ( (d = *dels++) && (d != c) );
  *SP = (p4cell) d;  /* zero if not found */
}

/**
 * {}ENDS  ( s 'chars -- s char|0 )
 *
 * If the string is not empty and its last character is in the
 * null-terminated list of characters, leave the character and
 * the original string.  Else leave zero and the original
 * string.  <ansref>"set-starts-question"</ansref>
 */
FCode (dw_set_ends)
{
  p4char c, d, *dels = (p4char *) SP[0];
  
  if (!SP[1]) { *SP = 0 ; return; }

  c = *( (p4char *) SP[2] + SP[1] - 1 );
  while ( (d = *dels++) && (d != c) );
  *SP = (p4cell) d;  /* zero if not found */
}

/**
 * SEPARATE  ( s char -- after.s before.s true | s false )
 *
 * Scan s for the first occurrence of char, or whitespace in
 * case char is BL.  If found, leave true and the strings after
 * and before char.  If not found, leave s and false. 
 * <ansref>"separate"</ansref>
 */
FCode (dw_separate)
{
  p4char *p = (p4char *) SP[2];
  p4char  c = (p4ucell) SP[0];
  p4ucell i = 0, n = (p4ucell) SP[1];

  if (c == ' ')
  { while ( (i < n) && !ISWHITE (p[i]) ) i++ ; }
  else
  { while (i < n && p[i] != c) i++; }

  if (i == n)
    *SP = 0;
  else
  {
    SP -= 2;
    SP[0] = ~0;
    SP[1] = i;  SP[2] = SP[4];
    SP[3] = n - i - 1;  SP[4] = (p4cell) (p + i + 1);
  }
}

/**
 * {}SEPARATE  ( s 'delims -- after.s before.s found.char | s false )
 *
 * Scan s for the first occurrence of any of the null-terminated
 * sequence of characters at 'delims as delimiter.  Bl is
 * treated literally, not as shorthand for whitespace. If a
 * delimiting character is found, leave it as found.char, and
 * leave the strings after and before.  If a delimiting
 * character is not found, leave zero instead of found.char and
 * the original string. <ansref>"set-separate"</ansref>
 */
FCode (dw_set_separate)
{
  p4char c, d, *dels = (p4char *) SP[0];
  p4ucell i = 0, j, n = (p4ucell) SP[1];
  p4char *p = (p4char *) SP[2];

  while ( i < n )
  {
    c = p[i++];
    j =  0;
    while ( (d = dels[j++]) != 0 )
    { if ( c == d ) goto found; }
  }

  *SP = 0 ; return;

found:
  SP -= 2;
  *SP = c;
  SP[1] = i - 1;  SP[2] = SP[4];
  SP[3] = n - i;  SP[4] = (p4cell) (p + i);
}

/**
 * S-SEPARATE  ( s pat.s -- after.s before.s true | s false )
 *
 * Scan s for the first occurrence of the pattern string.  If
 * found, leave true and the strings after and before the
 * pattern.  If not, leave false and the original string.
 * <ansref>"s-separate"</ansref>
 */
FCode (dw_s_separate)
{
  /* p4_search() uses char, not p4char */
  const char *p1 = (const char *) SP[3];
  p4cell l1 = SP[2], l2 = *SP;  
  const char *p =
    p4_search (p1, l1, (char *)SP[1], l2);

  if (p == NULL) *++SP = 0;
  else
  {
    *--SP = ~0;
    SP[1] = p - p1;            SP[2] = (p4cell) p1;
    SP[3] = p1 - p + l1 - l2;  SP[4] = (p4cell) (p + l2); 
  }
}

/**
 * S-AFTER  ( s s' -- after.s )
 *
 * Assume s' to be a substring of s.  Leave after.s, the
 * substring of s that follows s'. <ansref>"s-after"</ansref>
 */
FCode (dw_s_after)
{
  p4cell len = SP[2], lenp = SP[0];
  p4char *addr = (p4char*) SP[3], *addrp = (p4char*) SP[1];

  SP += 2;
  SP[0] = len - lenp - (addrp - addr);
  SP[1] = (p4cell) (addrp + lenp);
}

/**
 * FIRST-WORD  ( s -- word.s | right.empty.s )
 *
 * Leave the first word in s.  If there is none, leave the right
 * empty string.  <ansref>"first-word"</ansref>
 */
FCode (dw_first_word)
{
  p4ucell l = (p4ucell) SP[0], lrem;
  p4char *p = (p4char *) SP[1];

  while ( l &&  ISWHITE (*p) ) { l--; p++; }
  SP[1] = (p4cell) p;
  lrem = l;
  while ( l && !ISWHITE (*p) ) { l--; p++; }
  SP[0] = lrem - l;
}

/**
 * SEPARATE-WORD  ( s -- after.s word.s )
 *
 * Leave word.s as a substring of s, the first word in s.  Leave
 * after.s as the substring of s that follows the word, after
 * skipping the first of any whitespace characters that trail
 * the word.  If after.s or word.s is empty, it is the right
 * empty string.  <ansref>"separate-word"</ansref>
 */
FCode (dw_separate_word)
{
  p4ucell l = (p4ucell) SP[0], lrem;
  p4char *p = (p4char *) SP[1];

  SP -= 2; 
  while ( l &&  ISWHITE (*p) ) { l--; p++; }
  SP[1] = (p4cell) p;
  lrem = l;
  while ( l && !ISWHITE (*p) ) { l--; p++; }
  SP[0] = lrem - l;

  if ( l != 0 ) { p++; l--; } 
  SP[2] = l; SP[3] = (p4cell) p;
}


/****************
  2 Input Stream
 ****************/

/****************
  2.1 Parse Area
 ****************/

_export void
dw_parse_area (const p4_char_t **p, p4cell *n)
{
  p4_source (p, n);
  *p += TO_IN;  *n -= TO_IN;
}

/**
 * PARSE-AREA@  ( -- unparsed.s )
 *
 * Leave the parse area portion of the input buffer as a string.
 * Factored from Bernd Paysan's version of Frederick Warren's
 * $>, comp.lang.forth, March, 2000.  Named by Michael Gassanenko. 
 * <ansref>"parse-area-fetch"</ansref>
 *
 * : parse-area@  ( -- unparsed.s )
 *   source ( 'inbuf #inbuf) >in @ ( #parsed) /string ;
 */
FCode (dw_parse_area_fetch)
{
  p4cell n;
  const p4_char_t *p;

  SP -= 2;
  dw_parse_area (&p,&n );
  SP[0] = n; SP[1] = (p4cell) p;
}

/**
 * PARSE-AREA!  ( unparsed.s -- )
 *
 * The unparsed.s string is assumed to lie at the end of the
 * input buffer.  Advance the input stream so that unparsed.s is
 * the parse area.  Named by Michael Gassanenko.
 * <ansref>"parse-area-store"</ansref>
 *
 * : parse-area!  ( unparsed.s -- )
 *   ( addr len) source nip ( #tib) swap - >in ! ( addr) drop ;
 */
FCode (dw_parse_area_store)
{
  TO_IN = NUMBER_TIB - *SP;
  SP += 2;
}

/**
 * EMPTY-PARSE-AREA  ( -- )
 *
 * Exhaust the input buffer.  <ansref>"empty-parse-area"</ansref>
 *
 * Note:  REFILL can't always be used for this.  When the source
 * is a string, REFILL does nothing.  When it is a file, one may
 * not want the current input buffer to be written over.
 *
 * : empty-parse-area  ( -- )
 *   source ( len) >in ! ( addr) drop ;
 */
FCode (dw_empty_parse_area)
{
  TO_IN = NUMBER_TIB ;
}

/**
 * PARSE-AREA-EMPTY?  ( -- flag )
 */
FCode (dw_parse_area_empty_Q)
{
  *--SP = ( TO_IN == NUMBER_TIB ) ? ~0 : 0 ;
}

#if 0
/**
 * PARSE-NAME  ( "<name>" -- word.s )     CORE EXT
 *
 * Leave the next word in the parse area, and advance the input
 * stream to just after the first of any trailing whitespace
 * characters.  If there is no word in the parse area, word.s is
 * the right empty string, and the parse area is left exhausted.
 */
FCode (dw_parse_name)
{
  p4char *p = (p4char *) (TIB + TO_IN);
  p4ucell l = NUMBER_TIB - TO_IN, lrem;

  SP -= 2;
  while ( l &&  ISWHITE (*p) ) { l--; p++; }
  SP[1] = (p4cell) p;
  lrem = l;
  while ( l && !ISWHITE (*p) ) { l--; p++; }
  SP[0] = lrem - l;
  TO_IN = ( l-- > 0 ) ? NUMBER_TIB - l : NUMBER_TIB ;
}
#endif

/**
 * PREPARSE-NAME  ( "<name>" -- word.s )
 *
 * Get the next word without advancing the input stream.  Leo
 * Wong's PREPARSE.
 */
FCode (dw_preparse_name)
{
  p4char *p = (p4char *) TIB + TO_IN;
  p4ucell l = NUMBER_TIB - TO_IN, lrem;

  SP -= 2;
  while ( l &&  ISWHITE (*p) ) { l--; p++; }
  SP[1] = (p4cell) p;
  lrem = l;
  while ( l && !ISWHITE (*p) ) { l--; p++; }
  SP[0] = lrem - l;
}

/**
 * PARSE-NAME-AWAY  ( "<name>" -- )
 *
 * When the parse area is not all white space, parse away the
 * next word and the first of any trailing whitespace
 * characters.  Otherwise empty the parse area.  A logical
 * companion to PREPARSE-NAME.
 */ 
FCode (dw_parse_name_away)
{
  p4char *p = (p4char *) (TIB + TO_IN);
  p4ucell l = NUMBER_TIB - TO_IN;

  while ( l &&  isspace (*p) ) { l--; p++; }
  while ( l && !isspace (*p) ) { l--; p++; }
  TO_IN = ( l-- > 0 ) ? NUMBER_TIB - l : NUMBER_TIB ;
}

#define TO_IN_PP    if (TO_IN < NUMBER_TIB) ++TO_IN

/** >IN++  ( -- )
 *
 * Step the input stream one character forward, unless it is
 * exhausted.  <ansref>"to-in-increment"</ansref>
 */
FCode (dw_to_in_increment)
{
  TO_IN_PP;
}

/**
 * >IN--  ( -- )
 *
 * Step the input stream one character back, unless it is
 * positioned at its start.  <ansref>"to-in-decrement"</ansref>
 */
FCode (dw_to_in_decrement)
{
  if (TO_IN) --TO_IN;
}

/**
 * {}PARSE  ( 'delims -- parsed.s found.char )
 *
 * Scan the current SOURCE buffer for any of the null-terminated
 * sequence of characters at 'delims as delimiter, beginning at
 * the current position in the buffer.  If a delimiting
 * character is found, leave it as found.char.  If a delimiting
 * character is not found, leave zero as found.char.
 * 
 * If a delimiting character is found, the input stream is left
 * positioned just after it, and parsed.s is the initially
 * unparsed part of the input stream up to just before
 * found.char.
 *
 * If a delimiting character is not found, the input stream is
 * left exhausted, and parsed.s is the initially unparsed part
 * of the input stream.  <ansref>"set-parse"</ansref>
 */
FCode (dw_set_parse)  /* IMPROVE!! */
{ 
  /* d = 0 to stop warning: 'd' may be used uninitialized */
  p4char c, d = 0, *dels = (p4char *) *SP;
  const p4char *p;
  p4cell i0 = TO_IN, i, j, n, found = 0;

  SP -= 2;
  p4_source (&p, &n);
  p += i0;
  SP[2] = (p4ucell) p;

  for (i = i0; i < n && found == 0; i++)
  {
    c = *p++;
    j = 0;
    while ((d = dels[j++]) != 0)
    {
      if (c == d || (d == ' ' && c < ' '))
      {
        found = 1; break;
      }
    }
  }

  TO_IN = i;
  if (found)
  {
    *SP = d;
    SP[1] = i - i0 - 1;
  }
  else
  {
     *SP = 0;
     SP[1] = i - i0;
  }
}

/**
 * SPACE|PUNCT-PARSE  ( "<char>" -- parsed.s found.char|0 )
 *
 * Scan the current SOURCE buffer for a whitespace or any
 * printing character except a letter or digit, beginning at the
 * current position in the buffer.  If found, leave it as
 * found.char, with blank substituted for any whitespace
 * character, else leave zero.
 *
 * If one of the characters is found, the input stream is left
 * positioned just after it, and parsed.s is the initially
 * unparsed part of the input stream up to just before
 * found.char.
 *
 * If none is found, the input stream is left exhausted, and
 * parsed.s is the initially unparsed part of the input stream. 
 * <ansref>"space-or-punct-parse"</ansref>
 */
FCode (dw_space_or_punct_parse)
{ 
  p4char c, found = 0;
  const p4char *p;
  p4cell i0 = TO_IN, i, n;
  int spacefl = 0;

  SP -= 3;
  p4_source (&p, &n);
  p += i0;
  SP[2] = (p4ucell) p;

  for (i = i0; i < n; i++)
  {
    c = *p++;
    if ( ispunct (c) || (spacefl = ISWHITE (c)) )
    {
      found = spacefl ? ' ' :  c;
      break; 
    }
  }

  TO_IN = found ? i + 1 : i;
  SP[1] = i - i0;
  *SP = (p4cell) found;
}

#if 0
/**
 * {}SKIP-PARSE  ( 'delims -- )
 *
 * Leave the input stream for the current SOURCE buffer
 * positioned just after leading instances of any of the
 * null-terminated sequence of characters at 'delims, starting
 * at the current position.  If the input stream has only
 * leading delimiters, it is left exhausted.
 */
FCode (dw_set_skip_parse)
{
}
#endif


/****************************
  2.2 Across Lines or Blocks
 ****************************/

/**
 * "Input stream" refers to the input source across lines.
 *
 * With the exception of \\, all across-line words in this
 * section echo CR's when the lines are being entered
 * interactively at the terminal.  That feature has been tested
 * by copying and pasting the file parsing-test.fs into a
 * terminal window in which pfe is running.  See parsing-test.fs
 * for an explanation.
 */

/**
 * \\  ( "<eof>" -- )  2002
 *
 * Since REFILL [as well as p4_refill()] does nothing but return
 * false when the source is a string, the following covers the
 * string case, too.  It is of no use when the input source is
 * the terminal.
 */
FCode (dw_backslash_backslash)
{
  TO_IN = NUMBER_TIB ;
  while ( p4_refill () ) ;
}

/**
 * ?EMIT-CR  ( -- )
 *
 * For words that parse or collect text across lines and need to
 * echo a CR when the input is the terminal.
 */
FCode (dw_Q_emit_cr)
{
  if ( !SOURCE_ID ) FX (p4_cr);
}

/**
 * NEXT-INSTREAM-NAME  ( "<name>" -- word.s )  \ Wil Baden's NEXT-WORD
 *
 * Leave the next word in the input stream, and advance the
 * input stream to just after the first of any trailing
 * whitespace characters.  If word.s is empty, at most
 * whitespace was found.  A parsing implementation of Wil
 * Baden's NEXT-WORD, plus conditional echoing of CR's.
 */
FCode (dw_next_instream_name)
#if 0
{
  const p4char *p = (TIB + TO_IN), *q;
  p4cell l = NUMBER_TIB - TO_IN, lrem = l;
  int eof = 0;

  while ( !eof )
  {
    while ( l &&  ISWHITE (*p) ) { l--; p++; }
    q = p;  /* p is name pointer candidate */
    lrem = l;

    while ( l && !ISWHITE (*q) ) { l--; q++; }
    TO_IN = NUMBER_TIB - l;
    l = lrem - l;

    if ( l ) break;
    FX ( dw_Q_emit_cr );
    eof = !p4_refill (); p4_source (&p, &l);
  }

  SP -= 2;
  if ( !eof)
  { SP[0] = l; SP[1] = (p4cell) p; }
  else
  { SP[0] = 0; SP[1] = (p4cell) (p + l); }
}
#else
{
  const p4char *p = (TIB + TO_IN), *q;
  p4cell l = NUMBER_TIB - TO_IN, lrem = l;
  int eof = 0;

  while ( !eof )
  {
    while ( l &&  ISWHITE (*p) ) { l--; p++; }
    q = p;  /* p is name pointer candidate */
    lrem = l;

    while ( l && !ISWHITE (*q) ) { l--; q++; }
    TO_IN = ( l > 1 ) ? NUMBER_TIB - l + 1 : NUMBER_TIB ;
    l = lrem - l;

    if ( l ) break;
    FX ( dw_Q_emit_cr );
    eof = !p4_refill (); p4_source (&p, &l);
  }

  SP -= 2;
  if ( !eof)
  { SP[0] = l; SP[1] = (p4cell) p; }
  else
  { SP[0] = 0; SP[1] = (p4cell) (p + l); }
}
#endif

_export int
dw_s_scan_input (const p4_char_t *addr, p4cell len)
{
  int eof=0, pa_len;
  const p4_char_t *pa_addr;
  p4_char_t *q;

  dw_parse_area (&pa_addr, &pa_len);
  while
  (
    !eof &&
    !( q = (p4_char_t *) p4_search ( (const char *) pa_addr, pa_len,
    				     (const char *) addr, len ) )
  )
  {
    FX ( dw_Q_emit_cr );
    eof = !p4_refill (); p4_source (&pa_addr, &pa_len);
  }

  if (q) TO_IN = q - TIB;
  else   TO_IN = NUMBER_TIB;
  return !eof;
}

/**
 * S-SEEK-INSTREAM  ( pat.s -- flag )
 *
 * Assume pat.s to be nonempty.  Advance the input stream to
 * just after the next occurrence of the string, and leave true
 * if found.  If the string is not found before the end of the
 * file, leave the input stream positioned there with false.
 */
FCode (dw_s_seek_instream)
{
  p4cell len = *SP++;
  const p4_char_t *p = (p4_char_t *) *SP;

  if ( dw_s_scan_input (p, len) ) { TO_IN += len; *SP = ~0; }
  else *SP = 0 ;
}

_export int
dw_white_delimited ( const p4_char_t *p, p4cell len,
		     const p4_char_t *cbuf, p4cell cbufsz )
{
  int left, right;

  if ( !( left = (p == cbuf) ) ) left = WHITE_Q (*(p-1)); 
  cbuf += cbufsz; p += len;
  if ( !( right = (p == cbuf) ) ) right = WHITE_Q (*p);
  return left && right ;
}

/**
 * WHITE-DELIMITED?  ( s cbuf #cbuf -- flag )
 *
 * Assume s to be a nonempty substring of the characters in
 * the buffer.  If the beginning or end of s is either at the
 * beginning or end of the buffer, or is preceded or followed,
 * respectively, by a whitespace character in the buffer, leave
 * true.  Else leave false.
 */
FCode (dw_white_delimited_Q)
{
  int flag = dw_white_delimited ( (const p4_char_t *)SP[3], SP[2],
				  (const p4_char_t *)SP[1], SP[0] );
  SP += 3;
  *SP = flag ? ~0 : 0 ; 
}

/**
 * |S|-SEEK-INSTREAM  ( pat.s -- flag )
 *
 * Assume pat.s to be nonempty.  Assume that s is nonempty. 
 * Advance the input stream to just after the next occurrence of
 * the string, matching embedded whitespace characters exactly,
 * delimited by whitespace on both ends; and leave true if
 * found.  If the delimited string is not found after REFILL
 * attempts, empty the parse area and leave false.
 */
FCode (dw_bar_s_bar_seek_instream)
{
  p4cell len = *SP++, pa_len, src_len;
  const p4_char_t *addr = (p4_char_t *) *SP, *pa_addr, *src_addr;

  while ( dw_s_scan_input (addr, len) )
  {
    dw_parse_area (&pa_addr, &pa_len);
    p4_source (&src_addr, &src_len);
    if ( dw_white_delimited ( pa_addr, len, src_addr, src_len) )
    {
      TO_IN += len; *SP = ~0; return;
    }
    TO_IN_PP;
  }
  *SP = 0;
}

/**
 * (*  ( -- )
 *
 * Equivalent of
 *
 *   s" *)" |s|-input-past drop ; immediate
 *
 * Proposed for ANS Forth 20xx.
 */
FCode (dw_parens_star)
{
  SP -= 2;
  SP[0] = 2; SP[1] = (p4ucell) &"*)";
  FX ( dw_bar_s_bar_seek_instream );
  if ( !*SP++ ) p4_abortq ("***Missing *) terminator");
}

#if 0
/**
 * ---  ( -- )
 *
 * Equivalent of
 *
 *   s" ---" |s|-input-past drop ; immediate
 */
FCode (dw_3_dashes)
{
  SP -= 2;
  SP[0] = 3; SP[1] = (p4ucell) &"---";
  FX ( dw_bar_s_bar_seek_instream );
  SP++;
}
#endif

#if 0
/**
 * ((  ( -- )
 *
 * Equivalent of
 *
 *   s" ))" |s|-input-after drop ; immediate
 */
FCode (dw_parens_parens)
{
  SP -= 2;
  SP[0] = 2; SP[1] = (p4ucell) &"))";
  FX ( dw_bar_s_bar_seek_instream );
  SP++;
}
#endif


/*******
  Words
 *******/

P4_LISTWORDS (parsing) =
{
/* Lib identifier */
   P4_FXco ("PARSING-LIB-S",	dw_parsing_lib_s),
   
/* 1.1 ToolBelt 2002 */
   P4_FXco ("STRING/",		dw_string_slash),
   P4_FXco ("END-C@",		dw_end_c_fetch),
   P4_FXco ("SKIP",		dw_skip),
   P4_FXco ("SCAN",		dw_scan),
   P4_FXco ("SCAN-BACK",	dw_scan_back),
   P4_FXco ("BACK",		dw_scan_back),
   P4_FXco ("WHITE?",		dw_white_Q),
   P4_FXco ("TRIM\x7c",		dw_trim_bar),
   P4_xOLD ("TRIM",		"TRIM\x7c"),

/* 1.2 ToolBelt 2000 */
   P4_FXco ("/SPLIT",		dw_cut_split),
   P4_FXco ("S-STARTS",		dw_s_starts),
   P4_xOLD ("STARTS?",		"S-STARTS"),
   P4_FXco ("S-STARTS-NC",	dw_s_starts_nc),
   P4_xOLD ("STARTS-NC?",	"S-STARTS-NC"),
   P4_FXco ("S-ENDS",		dw_s_ends),
   P4_xOLD ("ENDS?",		"S-ENDS"),
   P4_FXco ("S-ENDS-NC",	dw_s_ends_nc),
   P4_xOLD ("ENDS-NC?",		"S-ENDS-NC"),
   P4_xOLD ("BL-SKIP",		"SKIP"),
   P4_xOLD ("BL-SCAN",		"SCAN"),

/* 1.3 Other String Extensions */
   P4_FXco ("CUT-FIRST",	p4_slash_string),
   P4_FXco ("CUT-LAST",		p4_minus),
   P4_FXco ("KEEP-FIRST",	p4_nip),
   P4_FXco ("KEEP-LAST",	dw_string_slash),
   P4_FXco ("SKIP-BACK",	dw_skip_back),
   P4_FXco ("\x7cTRIM",		dw_bar_trim),
   P4_FXco ("\x7cTRIM\x7c",	dw_bar_trim_bar),
   P4_FXco ("SEEK",		dw_seek),
   P4_FXco ("{}IN",		dw_set_in),
   P4_xOLD ("{}?",		"{}IN"),
/* P4_FXco ("{}SKIP",		dw_set_skip), */
   P4_FXco ("{}SCAN",		dw_set_scan),
   P4_FXco ("{}SEEK",		dw_set_seek),
   P4_xOLD ("{}SCAN-PAST",	"{}SEEK"),
   P4_FXco ("{}INDEX",		dw_set_index),
   P4_FXco ("{}STARTS",		dw_set_starts),
   P4_FXco ("{}ENDS",		dw_set_ends),
   P4_FXco ("SEPARATE",		dw_separate),
   P4_FXco ("{}SEPARATE",	dw_set_separate),
   P4_FXco ("S-SEPARATE",	dw_s_separate),
   P4_FXco ("S-AFTER",		dw_s_after),
   P4_xOLD ("SAFTER",		"S-AFTER"),
/* P4_FXco ("S-SKIP",		dw_s_skip), */
/* P4_FXco ("S-SCAN",		dw_s_scan), */
/* P4_FXco ("S-SEEK",		dw_s_seek), */
   P4_FXco ("FIRST-WORD",	dw_first_word),
   P4_FXco ("SEPARATE-WORD",	dw_separate_word),

/* 2.1 Parse Area */
   P4_FXco ("PARSE-AREA@",	dw_parse_area_fetch),
   P4_FXco ("PARSE-AREA!",	dw_parse_area_store),
   P4_FXco ("EMPTY-PARSE-AREA",	dw_empty_parse_area),
   P4_FXco ("PARSE-AREA-EMPTY?",  dw_parse_area_empty_Q),
   P4_xOLD ("EXHAUST-INPUT",	"EMPTY-PARSE-AREA"),
   P4_FXco (">IN++",		dw_to_in_increment),
   P4_FXco (">IN--",		dw_to_in_decrement),
#if 0
   P4_FXco ("PARSE-NAME",	dw_parse_name),
#endif
   P4_FXco ("PREPARSE-NAME",	dw_preparse_name),
   P4_FXco ("PARSE-NAME-AWAY",	dw_parse_name_away),
   P4_FXco ("{}PARSE",		dw_set_parse),
   P4_FXco ("SPACE|PUNCT-PARSE",  dw_space_or_punct_parse),

/* 2.2 Across Lines or Blocks */
   P4_FXco ("\\\\",		dw_backslash_backslash),
   P4_FXco ("?EMIT-CR",		dw_Q_emit_cr),
   P4_FXco ("NEXT-INSTREAM-NAME",  dw_next_instream_name),
/* P4_FXco ("{}SKIP-INSTREAM",	dw_set_skip_instream), */
/* P4_FXco ("{}SCAN-INSTREAM",	dw_set_scan_instream), */
/* P4_FXco ("{}SEEK-INPUTSTREAM",  dw_set_seek_inputstream), */
   P4_FXco ("S-SEEK-INSTREAM",	dw_s_seek_instream),
   P4_xOLD ("S-INPUT-PAST",	"S-SEEK-INSTREAM"),
   P4_FXco ("WHITE-DELIMITED?",	dw_white_delimited_Q),
   P4_FXco ("\x7cS\x7c-SEEK-INSTREAM",  dw_bar_s_bar_seek_instream),
   P4_IXco ("(*",		dw_parens_star),
/* P4_IXco ("((",		dw_parens_parens), */
/* P4_IXco ("---",		dw_3_dashes), */
};

P4_COUNTWORDS (parsing, "parsing primitives");