/** * Title: Parsing ANS Forth strings and * the input stream * File: parsing-ext.c * Author: David N. Williams * License: LGPL * Version: 1.0.0 * Last revision: October 26, 2010 * * Some of this file is derived from Wil Baden's ToolBelt, which * we believe to be in the public domain. We have borrowed * freely from the pfe implementation of the ToolBelt. For the * sake of the LGPL, the rest is * * Copyright (C) 1996-2008, 2010 by David N. Williams * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later * version. * * This library is distributed in the hope that it will be * useful, but WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR * PURPOSE. See the GNU Library General Public License for moref * details. * * You should have received a copy of the GNU Lesser General * Public License along with this library; if not, write to the * Free Software Foundation, 59 Temple Place, Suite 330, Boston, * MA 02111-1307 USA. * * If you take advantage of the option in the LGPL to put a * particular version of this library under the GPL, the author * would regard it as polite if you would put any direct * modifications under the LGPL as well, and include a copy of * this request near the beginning of the modified library * source. A "direct modification" is one that enhances or * extends the library in line with its original concept, as * opposed to developing a distinct application or library which * might use it. */ /********** Preamble **********/ /** * This is an external pfe module, not part of the pfe * distribution. * * This extension word set is for parsing ANS Forth strings, the * parse area of the input stream, and the input stream across * lines or blocks. Some of it is based on ^Forth Motorola * 680x0 parsing words going back to before 1996. Some is based * on Wil Baden's ToolBelt 2000 or 2002, with a few name * changes. * * See the file parsing.fs for an ANS Forth implementation, and * the file parsing-test.fs for Hayes tests. * * Since they are based on REFILL, words that act across lines * are expected to work across blocks as well, but have not been * tested with blocks. * * To be safe for all input sources, it's important that the * parse area be accessed only through p4_source() and TO_IN. * Unfortunately, the code in this file uses TIB and NUMBER_TIB * as well. That will be fixed in a later version. */ #define _P4_SOURCE 1 #include #include /* memcmp() */ #include /* toupper(), ispunct() */ /* We use the prefix "dw_" to avoid possible label conflicts * with pfe prefixes. */ #include "parsing-ext.h" static char libid[] = "parsing-ext pfe module dnw-1.0.0"; /** * PARSING-LIB-S ( -- ident.s ) */ FCode (dw_parsing_lib_s) { *--SP = (p4cell) libid; *--SP = strlen( libid ); } /******************* Table of Contents *******************/ /*** * 0 Notation and Terminology * 1 ANS Forth Strings * 1.1 ToolBelt 2002 * 1.2 ToolBelt 2000 * 1.3 Other String Extensions * * 2 Input Stream * 2.1 Parse Area * 2.2 Across Lines or Blocks */ /**************************** 0 Notation and Terminology ****************************/ /** * c[n]: The number of address units for n characters. The * result of ( n) CHARS. * * s, or .s suffix: Short for the pair representation * [addr len), an fstring. * * x.len: The len part of x = (addr len). * x.addr: The addr part of x = (addr len). * * s = s', or "s is the same as s'": If s is represented by * (addr len) and s' by (addr' len'), then addr = addr' and * len = len'. That is, the two fstrings have the same * representation. * * s' is a substring of s: addr <= addr' <= addr+c[len] and * addr'+c[len'] <= addr+c[len]. * * s is equivalent to s': The lengths of s and s' are the same. * The addresses of the string bodies need not be the same, * but if the lengths are nonzero, the bodies have the same * sequence of characters. * * right empty string: For s = (addr len), the right empty * string is (addr+c[len] 0), just off its end. According to * the definition of a substring, the right empty string of s * is a substring of s, even though there is no character of s * at its address. That includes the case where s is empty. * * left empty string: For s = (addr len), the left empty string * is (addr 0), at its beginning. According to the definition * of a substring, the left empty string is a substring of s, * although there is no character of s at its address. That * includes the case where s is empty. * * Certain specifications in the ToolBelt imply one of these * when an empty string results. Our specs could have been * written to be independent of such implementation details, * but we find it clearer and simpler to adopt the same * practice. * * whitespace: In this implementation, any of the ASCII * characters 0x0 ... 0x20. * * word in s: A substring of s satisfying all of the following: * 1. it is not empty; * 2. it contains no whitespace characters; * 3. it is preceded by a whitespace character or is at the * beginning of s; * 4. it is followed by a whitespace character or is at the * end of s. * * "": In stack comments, the next word in the parse * area, or the next word in the input stream if the comment * applies across lines. * * cbuf, #cbuf: In stack comments cbuf is the address of a * character aligned buffer. The number of characters the * buffer can hold is #cbuf. */ /********************* 1 ANS Forth Strings *********************/ /** * ANS Forth strings are represented on the data stack by an * (addr len) pair, where len is the number of characters. */ #define ISWHITE( C ) ( (unsigned char) C <= ' ' ) #define WHITE_Q( C ) ( ISWHITE( C ) ? ~0 : 0 ) /******************* 1.1 ToolBelt 2002 *******************/ /** * "ToolBelt 2002" refers to the 2003-02-25 revision. */ /** * STRING/ ( addr len i -- addr+c[len-i] i ) * * In the "normal case", where i is nonnegative and len-i is * nonnegative, the output string is the last i characters of * the input string. "string-slash" */ FCode (dw_string_slash) { p4cell i = SP[0], len = SP[1]; SP += 1; SP[1] = (p4cell) ( (p4char *) SP[1] + len - i ); SP[0] = i; } /** * END-C@ ( s -- char ) * * Assume the input string is not empty. Leave its last * character. */ FCode (dw_end_c_fetch) { p4cell len = SP[0]; p4char *addr = (p4char*) SP[1]; *++SP = (p4char) *(addr + len - 1); } /** * SKIP ( addr len char -- addr+c[i] len-i ) * * Skip leading instances of char. When char is BL, any leading * whitespace char is skipped. "skip" * * NOTE: Based on Guido Draheim's pfe implementation of SKIP * and BL-SKIP. */ FCode (dw_skip) { p4char *p = (p4char *) SP[2]; p4cell n = SP[1]; p4char c = (p4char) *SP++; if ( c == ' ' ) { while ( n && ISWHITE (*p) ) {n--; p++;} } else { while ( n && *p == c ) {n--; p++;} } SP[1] = (p4cell) p; SP[0] = n; } /** * SCAN ( str len char -- str+c[i] len-i ) * * Scan the string for the first occurrence of char. If found, * leave the rest of the string, starting at char. If not found * and if the input string is empty, leave it unchanged; * otherwise leave zero length and the address just after the * end of the string. When char is BL it is matched by any * whitespace. "scan" * * NOTE: Based on Guido Draheim's pfe implementation of SCAN * and BL-SCAN. */ FCode (dw_scan) { p4char *p = (p4char *) SP[2]; p4cell n = SP[1]; p4char c = (p4char) *SP++; if (c != ' ') { while ( n && *p != c ) { n--; p++; } } else { while ( n && !ISWHITE (*p) ) { n--; p++; } } SP[0] = n; SP[1] = (p4cell) p; } /** * scan-back ( addr len char -- addr len-i ) 2002 BACK * * Scan the input string backwards for char, or whitespace if * char is BL. The output string either ends with the first * backward occurrence of char, respectively, whitespace, or is * the left empty string. * * Note: We prefer the name SCAN-BACK to the ToolBelt's BACK, * because with no qualification, BACK could just as well mean * SKIP-BACK. */ FCode (dw_scan_back) { p4cell n = SP[1]; p4char *p = (p4char *) SP[2] + n; p4char c = (p4char) *SP++; if ( c == ' ' ) { while ( n && !ISWHITE (*--p) ) n--; } else { while ( n && *--p != c ) n--; } *SP = n; } /** * WHITE? ( char -- flag ) 2002 Is-White? */ FCode (dw_white_Q) { *SP = WHITE_Q ( *SP ); } /** * TRIM| ( addr len -- addr len-i ) 20002 TRIM * * Trim trailing whitespace from the input string. If the input * is all whitespace, the result is the left empty string. * * Note: The ToolBelt name is TRIM (the same in 2000 and 2002), * which it seems to us would more naturally mean to remove both * leading and trailing whitespace. */ FCode (dw_trim_bar) { *--SP = ' '; FX (dw_skip_back); } /******************* 1.2 ToolBelt 2000 *******************/ /** * /SPLIT ( addr len addr+c[i] len-i -- addr+c[i] len-i addr i ) * * Split the character string (addr len) at the place given by * (addr+c[i] len-i]. Wil Baden's "cut-split". ToolBelt 2002 * makes this a deprecated synonym for CHOP. * "cut-split" */ FCode (dw_cut_split) { p4cell len = SP[2], lenp = SP[0]; p4char *addr = (p4char*) SP[3], *addrp = (p4char*) SP[1]; SP[0] = len - lenp; SP[1] = (p4cell) addr; SP[2] = lenp; SP[3] = (p4cell) addrp; } /** * S-STARTS ( s pat.s -- s flag ) 2000 STARTS? * * Leave true if the string s starts with pat.s. From Wil * Baden's tool belt. We use the pfe implementation from * toolbelt-ext.c. "starts-question" * * NOTE: The flag is true if pat.s is empty or if both s and * pat.s are empty. */ FCode (dw_s_starts) { p4cell n = *SP++; if ( SP[1] < n || memcmp ((char*) SP[2], (char*) SP[0], n) ) *SP = 0; else *SP = ~0; } /** * S-STARTS-NC ( s uc.pat.s -- s flag ) * * Leave true if the string s starts with uc.pat.s. The pattern * string is assumed to be all upper case. Characters at the * beginning of s are forced to upper case before comparison, * without changing s. "starts-n-c-question" * * NOTE: The flag is true if pat.s is empty or if both s and * pat.s are empty. */ FCode (dw_s_starts_nc) { p4cell n = *SP++; char *p = (char *) SP[2], *q = (char *) *SP; if (SP[1] < n) { *SP = 0; return; } while (n) { if (*q != toupper (*p)) break; p++; q++; n--; } *SP = n ? 0 : ~0; } /** * S-ENDS? ( s pat.s -- s flag ) 2000 ENDS? * * Leave true if the string s ends with pat.s. From Wil Baden's * tool belt. We use the pfe implementation from * toolbelt-ext.c. "ends-question" */ FCode (dw_s_ends) { p4cell n = *SP++; if ( SP[1] < n || memcmp ((char*) SP[2] + (SP[1] - n), (char*) SP[0], n) ) *SP = 0; else *SP = ~0; } /** * S-ENDS-NC ( s uc.pat.s -- s flag ) * * Leave true if the string s ends with uc.pat.s. The pattern * string is assumed to be all upper case. Characters at the * end of s are forced to upper case before comparison, without * changing s. "ends-n-c-question" * * NOTE: The flag is true if pat.s is empty or if both s and * pat.s are empty. */ FCode (dw_s_ends_nc) { p4cell n = *SP++; p4cell l = SP[1]; char *p = (char *) SP[2] + l - n, *q = (char *) *SP; if (SP[1] < n) { *SP = 0; return; } while (n) { if (*q != toupper (*p)) break; p++; q++; n--; } *SP = n ? 0 : ~0; } /***************************** 1.3 Other String Extensions *****************************/ /** * All "{}" words treat BL as a literal character, not as * shorthand for white-space. */ /** * The following four words are implemented in this library as * aliases. They assume that len and the output length are * nonnegative. * * : CUT-FIRST ( addr len i -- addr+c[i] len-i ) /string ; * : CUT-LAST ( addr len i -- addr len-i ) - ; * : KEEP-FIRST ( addr len i -- addr i ) nip ; * : KEEP-LAST ( addr len i -- addr+c[len-i] i ) string/ ; */ /** * SKIP-BACK ( addr len char -- addr len-i | empty.s ) * * Trim trailing instances of char from the input string, or of * whitespace if char is BL. TRIM from the ToolBelt 2002 is * essentially the same as BL SKIP-BACK, which is the same as * the ANS word -TRAILING, up to implementation dependence on * how -TRAILING treats white space. * * Note: This implementation produces empty.s = (addr 0). */ FCode (dw_skip_back) { p4cell n = SP[1]; p4char *p = (p4char *) SP[2] + n; p4char c = (p4char) *SP++; if ( n ) p-- ; if ( c == ' ' ) { while ( n && ISWHITE (*p) ) { n--; --p; } } else { while ( n && *p == c ) { n--; --p; } } *SP = n; } /** * |TRIM ( s -- s' ) * * Leave the fstring s' resulting from trimming leading * whitespace from s. If s is all whitespace or empty, s' is * its right empty * string. */ FCode (dw_bar_trim) { *--SP = ' '; FX (dw_skip); } /** * |TRIM| ( s -- s' ) * * Leave the fstring s' resulting from trimming leading and * trailing whitespace from s. If s is all whitespace or empty, * s' is its right empty string. */ FCode (dw_bar_trim_bar) { *--SP = ' '; FX (dw_skip); *--SP = ' '; FX (dw_skip_back); } /** * SEEK ( addr len char -- addr+c[i] len-i found? ) * * Look for char in the input string, or whitespace if char is * BL. If the character is found, the output string begins just * after its first occurrence, or is the right empty string, and * found? is true. Else the output string is the right empty * string and found? is false. "seek" * * : seek ( ... -- ... ) scan dup IF 1 /string true ELSE false THEN ; */ FCode (dw_seek) { p4char *p = (p4char *) SP[2]; p4cell n = SP[1], found; p4char c = (p4char) SP[0]; if (c != ' ') { while ( n && *p != c ) { n--; p++; } } else { while ( n && !ISWHITE (*p) ) { n--; p++; } } if (n) { n--; p++; found = ~0; } else { found = 0; } SP[0] = found; SP[1] = n; SP[2] = (p4cell) p; } /** * {}IN ( char 'chars -- char|0 ) * * If the character is in the null-terminated list of * characters, leave it. Else leave 0. * "set-in" */ FCode (dw_set_in) { p4char *p = (p4char *) *SP++, c = *SP, elem; int i = 0; while ( (elem = *p++) && (elem != c) ) i++ ; *SP = elem; } /** * {}SCAN ( s &chars -- s' ) * * Leave the substring s' of s starting at the first occurrence * of one of the characters in the null-terminated string * starting at &chars. If no such character is found and s is * not empty, s' is empty. If s is empty, s' is s. * * If BL is in the set of delimiters, it is treated literally, * not as a shorthand for whitespace. * * In this implementation, when empty, s' is the right empty * string. */ FCode (dw_set_scan) { p4char c, d, *dels = (p4char *) *SP++, *p = (p4char *) SP[1]; p4cell j, n = SP[0]; while ( n != 0 ) { n--; c = *p++; j = 0; while ( (d = dels[j++]) != 0 ) { if ( c == d ) { n++; p--; goto push_result; } } } push_result: SP[0] = n ; SP[1] = (p4ucell) p; } /** * {}SEEK ( s &chars -- s' char|0 ) * * Scan s for the first occurrence of any of the characters in * the null-terminated string starting at &chars. If the * character is found, leave it, with s' the part of s that * follows it, or the right empty string if it is found at the * end of s. If s is empty, leave s and zero. If s is nonempty * and the character is not found, leave zero and the right * empty string. */ FCode (dw_set_seek) { p4char c, d, *dels = (p4char *) SP[0], *p = (p4char *) SP[2]; p4cell n = SP[1], i = n, j; while ( i != 0 ) { i--; c = *p++; j = 0; while ( (d = dels[j++]) != 0 ) { if ( c == d ) { goto push_result; } } } d = 0; push_result: SP[0] = d; SP[1] = i; SP[2] = (p4ucell) p; } /** * {}INDEX ( char 'chars -- index|-1 ) * * If char is in the null-terminated list of characters at * &chars, leave its index. Else leave -1. The first character * has index zero. "set-index" */ FCode (dw_set_index) { p4char *p = (p4char *) *SP++, c = SP[0], elem; p4cell i = 0; while ( (elem = *p++) && (elem != c) ) i++ ; *SP = elem ? i : -1 ; } /** * {}STARTS ( s 'chars -- s char|0 ) * * If the string is not empty and its first character is in the * null-terminated list of characters, leave the character and * the original string. Else leave zero and the original * string. "set-starts-question" */ FCode (dw_set_starts) { p4char c, d, *dels = (p4char *) SP[0]; if (!SP[1]) { *SP = 0 ; return; } c = *(p4char *) SP[2]; while ( (d = *dels++) && (d != c) ); *SP = (p4cell) d; /* zero if not found */ } /** * {}ENDS ( s 'chars -- s char|0 ) * * If the string is not empty and its last character is in the * null-terminated list of characters, leave the character and * the original string. Else leave zero and the original * string. "set-starts-question" */ FCode (dw_set_ends) { p4char c, d, *dels = (p4char *) SP[0]; if (!SP[1]) { *SP = 0 ; return; } c = *( (p4char *) SP[2] + SP[1] - 1 ); while ( (d = *dels++) && (d != c) ); *SP = (p4cell) d; /* zero if not found */ } /** * SEPARATE ( s char -- after.s before.s true | s false ) * * Scan s for the first occurrence of char, or whitespace in * case char is BL. If found, leave true and the strings after * and before char. If not found, leave s and false. * "separate" */ FCode (dw_separate) { p4char *p = (p4char *) SP[2]; p4char c = (p4ucell) SP[0]; p4ucell i = 0, n = (p4ucell) SP[1]; if (c == ' ') { while ( (i < n) && !ISWHITE (p[i]) ) i++ ; } else { while (i < n && p[i] != c) i++; } if (i == n) *SP = 0; else { SP -= 2; SP[0] = ~0; SP[1] = i; SP[2] = SP[4]; SP[3] = n - i - 1; SP[4] = (p4cell) (p + i + 1); } } /** * {}SEPARATE ( s 'delims -- after.s before.s found.char | s false ) * * Scan s for the first occurrence of any of the null-terminated * sequence of characters at 'delims as delimiter. Bl is * treated literally, not as shorthand for whitespace. If a * delimiting character is found, leave it as found.char, and * leave the strings after and before. If a delimiting * character is not found, leave zero instead of found.char and * the original string. "set-separate" */ FCode (dw_set_separate) { p4char c, d, *dels = (p4char *) SP[0]; p4ucell i = 0, j, n = (p4ucell) SP[1]; p4char *p = (p4char *) SP[2]; while ( i < n ) { c = p[i++]; j = 0; while ( (d = dels[j++]) != 0 ) { if ( c == d ) goto found; } } *SP = 0 ; return; found: SP -= 2; *SP = c; SP[1] = i - 1; SP[2] = SP[4]; SP[3] = n - i; SP[4] = (p4cell) (p + i); } /** * S-SEPARATE ( s pat.s -- after.s before.s true | s false ) * * Scan s for the first occurrence of the pattern string. If * found, leave true and the strings after and before the * pattern. If not, leave false and the original string. * "s-separate" */ FCode (dw_s_separate) { /* p4_search() uses char, not p4char */ const char *p1 = (const char *) SP[3]; p4cell l1 = SP[2], l2 = *SP; const char *p = p4_search (p1, l1, (char *)SP[1], l2); if (p == NULL) *++SP = 0; else { *--SP = ~0; SP[1] = p - p1; SP[2] = (p4cell) p1; SP[3] = p1 - p + l1 - l2; SP[4] = (p4cell) (p + l2); } } /** * S-AFTER ( s s' -- after.s ) * * Assume s' to be a substring of s. Leave after.s, the * substring of s that follows s'. "s-after" */ FCode (dw_s_after) { p4cell len = SP[2], lenp = SP[0]; p4char *addr = (p4char*) SP[3], *addrp = (p4char*) SP[1]; SP += 2; SP[0] = len - lenp - (addrp - addr); SP[1] = (p4cell) (addrp + lenp); } /** * FIRST-WORD ( s -- word.s | right.empty.s ) * * Leave the first word in s. If there is none, leave the right * empty string. "first-word" */ FCode (dw_first_word) { p4ucell l = (p4ucell) SP[0], lrem; p4char *p = (p4char *) SP[1]; while ( l && ISWHITE (*p) ) { l--; p++; } SP[1] = (p4cell) p; lrem = l; while ( l && !ISWHITE (*p) ) { l--; p++; } SP[0] = lrem - l; } /** * SEPARATE-WORD ( s -- after.s word.s ) * * Leave word.s as a substring of s, the first word in s. Leave * after.s as the substring of s that follows the word, after * skipping the first of any whitespace characters that trail * the word. If after.s or word.s is empty, it is the right * empty string. "separate-word" */ FCode (dw_separate_word) { p4ucell l = (p4ucell) SP[0], lrem; p4char *p = (p4char *) SP[1]; SP -= 2; while ( l && ISWHITE (*p) ) { l--; p++; } SP[1] = (p4cell) p; lrem = l; while ( l && !ISWHITE (*p) ) { l--; p++; } SP[0] = lrem - l; if ( l != 0 ) { p++; l--; } SP[2] = l; SP[3] = (p4cell) p; } /**************** 2 Input Stream ****************/ /**************** 2.1 Parse Area ****************/ _export void dw_parse_area (const p4_char_t **p, p4cell *n) { p4_source (p, n); *p += TO_IN; *n -= TO_IN; } /** * PARSE-AREA@ ( -- unparsed.s ) * * Leave the parse area portion of the input buffer as a string. * Factored from Bernd Paysan's version of Frederick Warren's * $>, comp.lang.forth, March, 2000. Named by Michael Gassanenko. * "parse-area-fetch" * * : parse-area@ ( -- unparsed.s ) * source ( 'inbuf #inbuf) >in @ ( #parsed) /string ; */ FCode (dw_parse_area_fetch) { p4cell n; const p4_char_t *p; SP -= 2; dw_parse_area (&p,&n ); SP[0] = n; SP[1] = (p4cell) p; } /** * PARSE-AREA! ( unparsed.s -- ) * * The unparsed.s string is assumed to lie at the end of the * input buffer. Advance the input stream so that unparsed.s is * the parse area. Named by Michael Gassanenko. * "parse-area-store" * * : parse-area! ( unparsed.s -- ) * ( addr len) source nip ( #tib) swap - >in ! ( addr) drop ; */ FCode (dw_parse_area_store) { TO_IN = NUMBER_TIB - *SP; SP += 2; } /** * EMPTY-PARSE-AREA ( -- ) * * Exhaust the input buffer. "empty-parse-area" * * Note: REFILL can't always be used for this. When the source * is a string, REFILL does nothing. When it is a file, one may * not want the current input buffer to be written over. * * : empty-parse-area ( -- ) * source ( len) >in ! ( addr) drop ; */ FCode (dw_empty_parse_area) { TO_IN = NUMBER_TIB ; } /** * PARSE-AREA-EMPTY? ( -- flag ) */ FCode (dw_parse_area_empty_Q) { *--SP = ( TO_IN == NUMBER_TIB ) ? ~0 : 0 ; } #if 0 /** * PARSE-NAME ( "" -- word.s ) CORE EXT * * Leave the next word in the parse area, and advance the input * stream to just after the first of any trailing whitespace * characters. If there is no word in the parse area, word.s is * the right empty string, and the parse area is left exhausted. */ FCode (dw_parse_name) { p4char *p = (p4char *) (TIB + TO_IN); p4ucell l = NUMBER_TIB - TO_IN, lrem; SP -= 2; while ( l && ISWHITE (*p) ) { l--; p++; } SP[1] = (p4cell) p; lrem = l; while ( l && !ISWHITE (*p) ) { l--; p++; } SP[0] = lrem - l; TO_IN = ( l-- > 0 ) ? NUMBER_TIB - l : NUMBER_TIB ; } #endif /** * PREPARSE-NAME ( "" -- word.s ) * * Get the next word without advancing the input stream. Leo * Wong's PREPARSE. */ FCode (dw_preparse_name) { p4char *p = (p4char *) TIB + TO_IN; p4ucell l = NUMBER_TIB - TO_IN, lrem; SP -= 2; while ( l && ISWHITE (*p) ) { l--; p++; } SP[1] = (p4cell) p; lrem = l; while ( l && !ISWHITE (*p) ) { l--; p++; } SP[0] = lrem - l; } /** * PARSE-NAME-AWAY ( "" -- ) * * When the parse area is not all white space, parse away the * next word and the first of any trailing whitespace * characters. Otherwise empty the parse area. A logical * companion to PREPARSE-NAME. */ FCode (dw_parse_name_away) { p4char *p = (p4char *) (TIB + TO_IN); p4ucell l = NUMBER_TIB - TO_IN; while ( l && isspace (*p) ) { l--; p++; } while ( l && !isspace (*p) ) { l--; p++; } TO_IN = ( l-- > 0 ) ? NUMBER_TIB - l : NUMBER_TIB ; } #define TO_IN_PP if (TO_IN < NUMBER_TIB) ++TO_IN /** >IN++ ( -- ) * * Step the input stream one character forward, unless it is * exhausted. "to-in-increment" */ FCode (dw_to_in_increment) { TO_IN_PP; } /** * >IN-- ( -- ) * * Step the input stream one character back, unless it is * positioned at its start. "to-in-decrement" */ FCode (dw_to_in_decrement) { if (TO_IN) --TO_IN; } /** * {}PARSE ( 'delims -- parsed.s found.char ) * * Scan the current SOURCE buffer for any of the null-terminated * sequence of characters at 'delims as delimiter, beginning at * the current position in the buffer. If a delimiting * character is found, leave it as found.char. If a delimiting * character is not found, leave zero as found.char. * * If a delimiting character is found, the input stream is left * positioned just after it, and parsed.s is the initially * unparsed part of the input stream up to just before * found.char. * * If a delimiting character is not found, the input stream is * left exhausted, and parsed.s is the initially unparsed part * of the input stream. "set-parse" */ FCode (dw_set_parse) /* IMPROVE!! */ { /* d = 0 to stop warning: 'd' may be used uninitialized */ p4char c, d = 0, *dels = (p4char *) *SP; const p4char *p; p4cell i0 = TO_IN, i, j, n, found = 0; SP -= 2; p4_source (&p, &n); p += i0; SP[2] = (p4ucell) p; for (i = i0; i < n && found == 0; i++) { c = *p++; j = 0; while ((d = dels[j++]) != 0) { if (c == d || (d == ' ' && c < ' ')) { found = 1; break; } } } TO_IN = i; if (found) { *SP = d; SP[1] = i - i0 - 1; } else { *SP = 0; SP[1] = i - i0; } } /** * SPACE|PUNCT-PARSE ( "" -- parsed.s found.char|0 ) * * Scan the current SOURCE buffer for a whitespace or any * printing character except a letter or digit, beginning at the * current position in the buffer. If found, leave it as * found.char, with blank substituted for any whitespace * character, else leave zero. * * If one of the characters is found, the input stream is left * positioned just after it, and parsed.s is the initially * unparsed part of the input stream up to just before * found.char. * * If none is found, the input stream is left exhausted, and * parsed.s is the initially unparsed part of the input stream. * "space-or-punct-parse" */ FCode (dw_space_or_punct_parse) { p4char c, found = 0; const p4char *p; p4cell i0 = TO_IN, i, n; int spacefl = 0; SP -= 3; p4_source (&p, &n); p += i0; SP[2] = (p4ucell) p; for (i = i0; i < n; i++) { c = *p++; if ( ispunct (c) || (spacefl = ISWHITE (c)) ) { found = spacefl ? ' ' : c; break; } } TO_IN = found ? i + 1 : i; SP[1] = i - i0; *SP = (p4cell) found; } #if 0 /** * {}SKIP-PARSE ( 'delims -- ) * * Leave the input stream for the current SOURCE buffer * positioned just after leading instances of any of the * null-terminated sequence of characters at 'delims, starting * at the current position. If the input stream has only * leading delimiters, it is left exhausted. */ FCode (dw_set_skip_parse) { } #endif /**************************** 2.2 Across Lines or Blocks ****************************/ /** * "Input stream" refers to the input source across lines. * * With the exception of \\, all across-line words in this * section echo CR's when the lines are being entered * interactively at the terminal. That feature has been tested * by copying and pasting the file parsing-test.fs into a * terminal window in which pfe is running. See parsing-test.fs * for an explanation. */ /** * \\ ( "" -- ) 2002 * * Since REFILL [as well as p4_refill()] does nothing but return * false when the source is a string, the following covers the * string case, too. It is of no use when the input source is * the terminal. */ FCode (dw_backslash_backslash) { TO_IN = NUMBER_TIB ; while ( p4_refill () ) ; } /** * ?EMIT-CR ( -- ) * * For words that parse or collect text across lines and need to * echo a CR when the input is the terminal. */ FCode (dw_Q_emit_cr) { if ( !SOURCE_ID ) FX (p4_cr); } /** * NEXT-INSTREAM-NAME ( "" -- word.s ) \ Wil Baden's NEXT-WORD * * Leave the next word in the input stream, and advance the * input stream to just after the first of any trailing * whitespace characters. If word.s is empty, at most * whitespace was found. A parsing implementation of Wil * Baden's NEXT-WORD, plus conditional echoing of CR's. */ FCode (dw_next_instream_name) #if 0 { const p4char *p = (TIB + TO_IN), *q; p4cell l = NUMBER_TIB - TO_IN, lrem = l; int eof = 0; while ( !eof ) { while ( l && ISWHITE (*p) ) { l--; p++; } q = p; /* p is name pointer candidate */ lrem = l; while ( l && !ISWHITE (*q) ) { l--; q++; } TO_IN = NUMBER_TIB - l; l = lrem - l; if ( l ) break; FX ( dw_Q_emit_cr ); eof = !p4_refill (); p4_source (&p, &l); } SP -= 2; if ( !eof) { SP[0] = l; SP[1] = (p4cell) p; } else { SP[0] = 0; SP[1] = (p4cell) (p + l); } } #else { const p4char *p = (TIB + TO_IN), *q; p4cell l = NUMBER_TIB - TO_IN, lrem = l; int eof = 0; while ( !eof ) { while ( l && ISWHITE (*p) ) { l--; p++; } q = p; /* p is name pointer candidate */ lrem = l; while ( l && !ISWHITE (*q) ) { l--; q++; } TO_IN = ( l > 1 ) ? NUMBER_TIB - l + 1 : NUMBER_TIB ; l = lrem - l; if ( l ) break; FX ( dw_Q_emit_cr ); eof = !p4_refill (); p4_source (&p, &l); } SP -= 2; if ( !eof) { SP[0] = l; SP[1] = (p4cell) p; } else { SP[0] = 0; SP[1] = (p4cell) (p + l); } } #endif _export int dw_s_scan_input (const p4_char_t *addr, p4cell len) { int eof=0, pa_len; const p4_char_t *pa_addr; p4_char_t *q; dw_parse_area (&pa_addr, &pa_len); while ( !eof && !( q = (p4_char_t *) p4_search ( (const char *) pa_addr, pa_len, (const char *) addr, len ) ) ) { FX ( dw_Q_emit_cr ); eof = !p4_refill (); p4_source (&pa_addr, &pa_len); } if (q) TO_IN = q - TIB; else TO_IN = NUMBER_TIB; return !eof; } /** * S-SEEK-INSTREAM ( pat.s -- flag ) * * Assume pat.s to be nonempty. Advance the input stream to * just after the next occurrence of the string, and leave true * if found. If the string is not found before the end of the * file, leave the input stream positioned there with false. */ FCode (dw_s_seek_instream) { p4cell len = *SP++; const p4_char_t *p = (p4_char_t *) *SP; if ( dw_s_scan_input (p, len) ) { TO_IN += len; *SP = ~0; } else *SP = 0 ; } _export int dw_white_delimited ( const p4_char_t *p, p4cell len, const p4_char_t *cbuf, p4cell cbufsz ) { int left, right; if ( !( left = (p == cbuf) ) ) left = WHITE_Q (*(p-1)); cbuf += cbufsz; p += len; if ( !( right = (p == cbuf) ) ) right = WHITE_Q (*p); return left && right ; } /** * WHITE-DELIMITED? ( s cbuf #cbuf -- flag ) * * Assume s to be a nonempty substring of the characters in * the buffer. If the beginning or end of s is either at the * beginning or end of the buffer, or is preceded or followed, * respectively, by a whitespace character in the buffer, leave * true. Else leave false. */ FCode (dw_white_delimited_Q) { int flag = dw_white_delimited ( (const p4_char_t *)SP[3], SP[2], (const p4_char_t *)SP[1], SP[0] ); SP += 3; *SP = flag ? ~0 : 0 ; } /** * |S|-SEEK-INSTREAM ( pat.s -- flag ) * * Assume pat.s to be nonempty. Assume that s is nonempty. * Advance the input stream to just after the next occurrence of * the string, matching embedded whitespace characters exactly, * delimited by whitespace on both ends; and leave true if * found. If the delimited string is not found after REFILL * attempts, empty the parse area and leave false. */ FCode (dw_bar_s_bar_seek_instream) { p4cell len = *SP++, pa_len, src_len; const p4_char_t *addr = (p4_char_t *) *SP, *pa_addr, *src_addr; while ( dw_s_scan_input (addr, len) ) { dw_parse_area (&pa_addr, &pa_len); p4_source (&src_addr, &src_len); if ( dw_white_delimited ( pa_addr, len, src_addr, src_len) ) { TO_IN += len; *SP = ~0; return; } TO_IN_PP; } *SP = 0; } /** * (* ( -- ) * * Equivalent of * * s" *)" |s|-input-past drop ; immediate * * Proposed for ANS Forth 20xx. */ FCode (dw_parens_star) { SP -= 2; SP[0] = 2; SP[1] = (p4ucell) &"*)"; FX ( dw_bar_s_bar_seek_instream ); if ( !*SP++ ) p4_abortq ("***Missing *) terminator"); } #if 0 /** * --- ( -- ) * * Equivalent of * * s" ---" |s|-input-past drop ; immediate */ FCode (dw_3_dashes) { SP -= 2; SP[0] = 3; SP[1] = (p4ucell) &"---"; FX ( dw_bar_s_bar_seek_instream ); SP++; } #endif #if 0 /** * (( ( -- ) * * Equivalent of * * s" ))" |s|-input-after drop ; immediate */ FCode (dw_parens_parens) { SP -= 2; SP[0] = 2; SP[1] = (p4ucell) &"))"; FX ( dw_bar_s_bar_seek_instream ); SP++; } #endif /******* Words *******/ P4_LISTWORDS (parsing) = { /* Lib identifier */ P4_FXco ("PARSING-LIB-S", dw_parsing_lib_s), /* 1.1 ToolBelt 2002 */ P4_FXco ("STRING/", dw_string_slash), P4_FXco ("END-C@", dw_end_c_fetch), P4_FXco ("SKIP", dw_skip), P4_FXco ("SCAN", dw_scan), P4_FXco ("SCAN-BACK", dw_scan_back), P4_FXco ("BACK", dw_scan_back), P4_FXco ("WHITE?", dw_white_Q), P4_FXco ("TRIM\x7c", dw_trim_bar), P4_xOLD ("TRIM", "TRIM\x7c"), /* 1.2 ToolBelt 2000 */ P4_FXco ("/SPLIT", dw_cut_split), P4_FXco ("S-STARTS", dw_s_starts), P4_xOLD ("STARTS?", "S-STARTS"), P4_FXco ("S-STARTS-NC", dw_s_starts_nc), P4_xOLD ("STARTS-NC?", "S-STARTS-NC"), P4_FXco ("S-ENDS", dw_s_ends), P4_xOLD ("ENDS?", "S-ENDS"), P4_FXco ("S-ENDS-NC", dw_s_ends_nc), P4_xOLD ("ENDS-NC?", "S-ENDS-NC"), P4_xOLD ("BL-SKIP", "SKIP"), P4_xOLD ("BL-SCAN", "SCAN"), /* 1.3 Other String Extensions */ P4_FXco ("CUT-FIRST", p4_slash_string), P4_FXco ("CUT-LAST", p4_minus), P4_FXco ("KEEP-FIRST", p4_nip), P4_FXco ("KEEP-LAST", dw_string_slash), P4_FXco ("SKIP-BACK", dw_skip_back), P4_FXco ("\x7cTRIM", dw_bar_trim), P4_FXco ("\x7cTRIM\x7c", dw_bar_trim_bar), P4_FXco ("SEEK", dw_seek), P4_FXco ("{}IN", dw_set_in), P4_xOLD ("{}?", "{}IN"), /* P4_FXco ("{}SKIP", dw_set_skip), */ P4_FXco ("{}SCAN", dw_set_scan), P4_FXco ("{}SEEK", dw_set_seek), P4_xOLD ("{}SCAN-PAST", "{}SEEK"), P4_FXco ("{}INDEX", dw_set_index), P4_FXco ("{}STARTS", dw_set_starts), P4_FXco ("{}ENDS", dw_set_ends), P4_FXco ("SEPARATE", dw_separate), P4_FXco ("{}SEPARATE", dw_set_separate), P4_FXco ("S-SEPARATE", dw_s_separate), P4_FXco ("S-AFTER", dw_s_after), P4_xOLD ("SAFTER", "S-AFTER"), /* P4_FXco ("S-SKIP", dw_s_skip), */ /* P4_FXco ("S-SCAN", dw_s_scan), */ /* P4_FXco ("S-SEEK", dw_s_seek), */ P4_FXco ("FIRST-WORD", dw_first_word), P4_FXco ("SEPARATE-WORD", dw_separate_word), /* 2.1 Parse Area */ P4_FXco ("PARSE-AREA@", dw_parse_area_fetch), P4_FXco ("PARSE-AREA!", dw_parse_area_store), P4_FXco ("EMPTY-PARSE-AREA", dw_empty_parse_area), P4_FXco ("PARSE-AREA-EMPTY?", dw_parse_area_empty_Q), P4_xOLD ("EXHAUST-INPUT", "EMPTY-PARSE-AREA"), P4_FXco (">IN++", dw_to_in_increment), P4_FXco (">IN--", dw_to_in_decrement), #if 0 P4_FXco ("PARSE-NAME", dw_parse_name), #endif P4_FXco ("PREPARSE-NAME", dw_preparse_name), P4_FXco ("PARSE-NAME-AWAY", dw_parse_name_away), P4_FXco ("{}PARSE", dw_set_parse), P4_FXco ("SPACE|PUNCT-PARSE", dw_space_or_punct_parse), /* 2.2 Across Lines or Blocks */ P4_FXco ("\\\\", dw_backslash_backslash), P4_FXco ("?EMIT-CR", dw_Q_emit_cr), P4_FXco ("NEXT-INSTREAM-NAME", dw_next_instream_name), /* P4_FXco ("{}SKIP-INSTREAM", dw_set_skip_instream), */ /* P4_FXco ("{}SCAN-INSTREAM", dw_set_scan_instream), */ /* P4_FXco ("{}SEEK-INPUTSTREAM", dw_set_seek_inputstream), */ P4_FXco ("S-SEEK-INSTREAM", dw_s_seek_instream), P4_xOLD ("S-INPUT-PAST", "S-SEEK-INSTREAM"), P4_FXco ("WHITE-DELIMITED?", dw_white_delimited_Q), P4_FXco ("\x7cS\x7c-SEEK-INSTREAM", dw_bar_s_bar_seek_instream), P4_IXco ("(*", dw_parens_star), /* P4_IXco ("((", dw_parens_parens), */ /* P4_IXco ("---", dw_3_dashes), */ }; P4_COUNTWORDS (parsing, "parsing primitives");