mdhender

super secret hq

A Better Forth Scanner

This scanner is more robust. It detects unterminated quoted text and reports it as an error. It also recognizes escapes inside the quoted text (a doubled quote character stands for a literal quote). It doesn’t do anything with the escapes because it’s just a scanner.

If you compare the code to the prior version, you should notice that I changed the name from “scanner” to “lexer.” I made the change to more accurately reflect what the code should be doing.

I also added a “line” field to the lexeme to track the current line in the input buffer. The caller initializes it and the lexer increments it as it scans newlines. That’s useful to know when displaying error messages.

// kinds of lexeme that the lexer can report.
//
typedef enum {
    lexkEndOfInput,    // nothing left to scan
    lexkError,         // problem with the input (e.g. unterminated quoted text)
    lexkSpace,         // run of whitespace
    lexkQuotedText,    // word delimited by a pair of matching quote marks
    lexkUnquotedText   // any other word
} LEXKIND;

// describes one lexeme located in the input buffer.
//
// first and last point into the caller's buffer; no text is copied.
//
typedef struct LEXEME {
  LEXKIND     kind;   // what was scanned
  int         line;   // line counter; set by the caller, incremented by the lexer on newlines
  const char *first;  // first character of the lexeme (0 at end of input)
  const char *last;   // last character of the lexeme, inclusive (0 at end of input)
} LEXEME;

// scans the next lexeme (a run of whitespace or a single word) from input.
//
// whitespace is anything that the C "isspace" function accepts.
//
// words are everything else.
//
// there are two types of words, quoted text and unquoted text.
//
// parameters
//   input  - NUL-terminated text to scan, starting at the next unread
//            character. may be null.
//   lexeme - receives the result. if null, the function does nothing.
//            lexeme->line is never initialised here, only incremented,
//            so the caller must set it before the first call.
//
// side-effects
//   updates the input LEXEME with information on the lexeme that was scanned
//     kind == lexkEndOfInput if input is exhausted (first and last are 0)
//     kind == lexkError if there are problems with the input
//     otherwise, kind is set to the type of word
//     first/last point at the first and last characters of the lexeme
//     line is incremented once for every newline consumed
//
void ForthLexer(const char *input, LEXEME *lexeme) {
  // nowhere to report the result, so there is nothing useful to do
  //
  if (!lexeme) {
      return;
  }

  // if nothing to parse, signal end of input.
  //
  if (!input || !*input) {
      lexeme->kind  = lexkEndOfInput;
      lexeme->first = 0;
      lexeme->last  = 0;
      return;
  }

  // lexeme->first points to the first character in the text
  //
  lexeme->first = input;

  // scan whitespace. ends at the first non-space character or end of string.
  // the cast to unsigned char is required: passing a negative char value
  // to isspace is undefined behaviour.
  //
  if (isspace((unsigned char)*input)) {
      lexeme->kind = lexkSpace;

      // the loop starts at the first character so that a leading
      // newline is counted too. it stops at the terminator because
      // isspace('\0') is false.
      //
      while (isspace((unsigned char)*input)) {
          if (*input == '\n') {
              lexeme->line++;
          }
          input++;
      }

      // lexeme->last points at the last space character
      //
      lexeme->last = input - 1;
      return;
  }

  // scan quoted text. ends at the quote delimiter or end of string.
  // two consecutive quotes are considered to be an escape and do not
  // count as a quote delimiter.
  //
  if (*input == '"' || *input == '\'') {
      const char quote = *input++;

      lexeme->kind = lexkQuotedText;

      while (*input) {
          if (*input == quote) {
              // end of quoted text?
              if (*(input + 1) != quote) {
                  break;
              }
              // escaped quote, advance past the first of the pair
              input++;
          } else if (*input == '\n') {
              lexeme->line++;
          }
          input++;
      }

      // return an error if we found unterminated quoted text.
      // lexeme->last then points at the last character before the
      // end of the string.
      //
      if (*input != quote) {
          lexeme->kind = lexkError;
          lexeme->last = input - 1;
          return;
      }

      // lexeme->last points at the closing quote mark.
      //
      lexeme->last = input;
      return;
  }

  // scan unquoted text. ends at the first space character or end of string.
  // the explicit test for the terminator matters: isspace('\0') is false,
  // so without it the scan would run off the end of the buffer.
  //
  lexeme->kind = lexkUnquotedText;

  input++;
  while (*input && !isspace((unsigned char)*input)) {
      input++;
  }

  // lexeme->last points at the last character in the unquoted text
  //
  lexeme->last = input - 1;
}

The Updated Driver

The driver calls the lexer repeatedly, resuming just past the previous lexeme each time, until the input is exhausted or the lexer reports an error.

bool Driver(const char *inputBuffer) {
  LEXEME lex;
  lex.line = 1;

  do {
      ForthLexer(inputBuffer, &lex);
      if (lex.kind == lexkError) {
          // do something here
          break;
      }
      inputBuffer = lex.last + 1;
  } while (lex.kind != lexkEndOfInput);

  return true;
}

Comments