mdhender

super secret hq

Single Line Comments

Before starting the parser, let’s modify the lexer to accept comments.

For our purposes, a comment is two semicolons followed by all the text up to (but not including) the new-line character or the end of the input, whichever comes first.

grammar = ( comment | WHITESPACE | word )* .
comment = ";;" [^\n]* .
word    = ( QUOTEDTEXT | UNQUOTEDTEXT ) .

It’s not much work to add comments. We add a new tag so that we can tell the driver what we found. We add a section of code that looks for the ;; and sets up the lexeme.

// tags that tell the driver what kind of lexeme the lexer found.
//
typedef enum {
  lexkEndOfInput,    // input is exhausted
  lexkError,         // there are problems with the input
  lexkSpace,         // a run of whitespace
  lexkQuotedText,    // a word wrapped in quote marks
  lexkUnquotedText,  // any other word
  lexkComment        // ";;" up to the end of the line
} LEXKIND;

// describes one lexeme found by the lexer.
//
typedef struct LEXEME {
  LEXKIND     kind;   // what was scanned
  int         line;   // bumped by the lexer for each new-line it scans
  const char *first;  // first character of the lexeme
  const char *last;   // last character of the lexeme
} LEXEME;

// scans the text at input for a single lexeme: whitespace, a comment,
// or a word.
//
// whitespace is anything that the C "isspace" function accepts.
//
// comments are ";;" up to (but not including) the new-line or end of input.
//
// words are everything else.
//
// there are two types of words, quoted text and unquoted text.
//
// parameters
//   input   NUL-terminated text to scan. may be null.
//   lexeme  receives the result. nothing is done if it is null.
//
// side-effects
//   updates the input LEXEME with information on the lexeme that was scanned
//     kind == lexkEndOfInput if input is exhausted (first and last are set to 0)
//     kind == lexkError if quoted text is missing its closing quote
//     otherwise, kind is set to lexkSpace, lexkComment, or the type of word
//     first and last point at the first and last characters of the lexeme
//     line is incremented for every new-line inside the lexeme. it is
//       never initialized here, so the caller must set it before the first call
//
// the scan position is not returned; lexeme->last is the only record of
// where the lexeme ended.
//
void ForthLexer(const char *input, LEXEME *lexeme) {
  // needless sanity check for valid input
  //
  if (!lexeme) {
      return;
  }

  // if nothing to parse, signal end of input.
  //
  if (!input || !*input) {
      lexeme->kind  = lexkEndOfInput;
      lexeme->first = 0;
      lexeme->last  = 0;
      return;
  }

  // lexeme->first points to the first character in the text
  //
  lexeme->first = input;

  // scan whitespace. ends at the first non-space character or end of string.
  // the cast keeps isspace defined for characters with negative values.
  //
  if (isspace((unsigned char)*input)) {
      lexeme->kind = lexkSpace;

      // the loop starts on the first character so that a new-line at
      // the very start of the run is counted too.
      //
      while (isspace((unsigned char)*input)) {
          if (*input == '\n') {
              lexeme->line++;
          }
          input++;
      }

      // lexeme->last points at the last space character
      //
      lexeme->last = input - 1;
      return;
  }

  // scan comments. starts with ";;" and ends at the first new-line
  // or end of input.
  //
  if (input[0] == ';' && input[1] == ';') {
      lexeme->kind = lexkComment;

      input += 2;
      while (*input && *input != '\n') {
          input++;
      }

      // lexeme->last points at character before the end of line
      // or end of input
      //
      lexeme->last = input - 1;
      return;
  }

  // scan quoted text. ends at the quote delimiter or end of string.
  // two consecutive quotes are considered to be an escape and do not
  // count as a quote delimiter.
  //
  if (*input == '"' || *input == '\'') {
      const char quote = *input++;

      lexeme->kind = lexkQuotedText;

      while (*input) {
          if (*input == quote) {
              // end of quoted text?
              if (*(input + 1) != quote) {
                  break;
              }
              // escaped quote, advance past it
              input++;
          } else if (*input == '\n') {
              lexeme->line++;
          }
          input++;
      }

      // return an error if we found unterminated quoted text
      //
      if (*input != quote) {
          lexeme->kind = lexkError;
          lexeme->last = input - 1;
          return;
      }

      // lexeme->last points at the closing quote mark.
      //
      lexeme->last = input;
      return;
  }

  // scan unquoted text. ends at the first space character or end of string.
  // the end of string test is required: isspace rejects the terminator,
  // so without it the scan would run off the end of the input.
  //
  lexeme->kind = lexkUnquotedText;

  input++;
  while (*input && !isspace((unsigned char)*input)) {
      input++;
  }

  // lexeme->last points at the last character in the unquoted text
  //
  lexeme->last = input - 1;
}

A Note on Whitespace And Comments

A good lexer will return every terminal that is specified in the grammar.

We added comments and whitespace to the grammar to give us a chance to push the code around a bit. In normal practice, though, the grammar would not contain those items because they complicate the grammar. In our case, we’d expect to see a grammar like:

grammar = ( QUOTEDTEXT | UNQUOTEDTEXT )* .

The lexer would be coded to recognize the comments and whitespace, but not to return them to the driver. The only reason to return them would be if the parser needed to use them.

An example would be a tool that generates documentation from specially formatted comments. Something like “rdoc.”

We won’t be going down that path, so we will strip the comments and whitespace from our grammar and update the lexer to match.

// tags that tell the driver what kind of lexeme the lexer found.
//
typedef enum {
  lexkEndOfInput,    // input is exhausted
  lexkError,         // there are problems with the input
  lexkQuotedText,    // a word wrapped in quote marks
  lexkUnquotedText   // any other word
} LEXKIND;

// describes one lexeme found by the lexer.
//
typedef struct LEXEME {
  LEXKIND     kind;   // what was scanned
  int         line;   // bumped by the lexer for each new-line it scans
  const char *first;  // first character of the lexeme
  const char *last;   // last character of the lexeme
} LEXEME;

// scans input for the next word. ignores comments and whitespace.
//
// whitespace is anything that the C "isspace" function accepts.
//
// comments are ";;" to the end of the line.
//
// words are everything else.
//
// there are two types of words, quoted text and unquoted text.
//
// parameters
//   input   NUL-terminated text to scan. may be null.
//   lexeme  receives the result. nothing is done if it is null.
//
// side-effects
//   updates the input LEXEME with information on the lexeme that was scanned
//     kind == lexkEndOfInput if input is exhausted (first and last are set to 0)
//     kind == lexkError if quoted text is missing its closing quote
//     otherwise, kind is set to the type of word
//     first and last point at the first and last characters of the word
//     line is incremented for every new-line that is skipped or that is
//       inside quoted text. it is never initialized here, so the caller
//       must set it before the first call
//
// the scan position is not returned; lexeme->last is the only record of
// where the word ended.
//
void ForthLexer(const char *input, LEXEME *lexeme) {
  // needless sanity check for valid input
  //
  if (!lexeme) {
      return;
  } else if (!input) {
      // nothing to parse, so signal end of input.
      //
      lexeme->kind  = lexkEndOfInput;
      lexeme->first = 0;
      lexeme->last  = 0;
      return;
  }

  // ignore items such as comments and whitespace. keep going around
  // until a pass finds neither, since they can follow each other in
  // any order.
  //
  int itemsToIgnore;
  do {
      itemsToIgnore = 0;

      // scan whitespace. ends at the first non-space character or end of input.
      // the cast keeps isspace defined for characters with negative values.
      //
      if (isspace((unsigned char)*input)) {
          itemsToIgnore++;

          while (isspace((unsigned char)*input)) {
              if (*input == '\n') {
                  lexeme->line++;
              }
              input++;
          }
      }

      // scan comments. starts with ";;" and ends at the first new-line
      // or end of input. the new-line itself is left for the whitespace
      // scan on the next pass so that it gets counted.
      //
      if (*input == ';' && *(input + 1) == ';') {
          itemsToIgnore++;

          while (*input && *input != '\n') {
              input++;
          }
      }
  } while (itemsToIgnore);

  // if nothing to parse, signal end of input.
  //
  if (!*input) {
      lexeme->kind  = lexkEndOfInput;
      lexeme->first = 0;
      lexeme->last  = 0;
      return;
  }

  // lexeme->first points to the first character in the text
  //
  lexeme->first = input++;

  // scan quoted text. ends at the quote delimiter or end of input.
  // two consecutive quotes are considered to be an escape and do not
  // count as a quote delimiter.
  //
  if (*(lexeme->first) == '"' || *(lexeme->first) == '\'') {
      lexeme->kind = lexkQuotedText;

      char quote = *(lexeme->first);

      while (*input) {
          if (*input == quote) {
              // end of quoted text?
              if (*(input + 1) != quote) {
                  break;
              }
              // escaped quote, advance past it
              input++;
          } else if (*input == '\n') {
              lexeme->line++;
          }
          input++;
      }

      // return an error if we found unterminated quoted text
      //
      if (*input != quote) {
          lexeme->kind = lexkError;
          lexeme->last = input - 1;
          return;
      }

      // lexeme->last points at the closing quote mark.
      //
      lexeme->last = input;
      return;
  }

  // scan unquoted text. ends at the first space character or end of input.
  // the end of input test is required: isspace rejects the terminator,
  // so without it the scan would run off the end of the input.
  //
  lexeme->kind = lexkUnquotedText;

  while (*input && !isspace((unsigned char)*input)) {
      input++;
  }

  // lexeme->last points at the last character in the unquoted text
  //
  lexeme->last = input - 1;
}

Comments