Parsing McCarthy's S-Expressions

This article builds a top-down, recursive descent parser for simple s-expressions. S-expressions are used to represent both data and code in LISP. We’re not going to build a full-blown LISP interpreter here, just a parser for s-expressions.

I’m basing this grammar on the LISP 1.5 Programmer’s Manual written by McCarthy, Abrahams, Edwards, Hart & Levin. The grammar accepts both s-expressions and list notation. An s-expression is the basic form in LISP. It consists of atoms and dotted pairs. Here are some examples:

A
(A . B)
(A . (B . C))

List notation simplifies that by removing the dots and extra parentheses. (Strictly speaking, the manual treats a list like (A B C) as shorthand for NIL-terminated dotted pairs, i.e. (A . (B . (C . NIL))), but for our parser all that matters is that both forms are accepted.) The manual also describes a comma-separated version of list notation (e.g., (A, B, C)), but we will not be doing that.

A
(A B)
(A B C)

To simplify some of the code in the parser, we’re going to assume that the token stream always begins and ends with markers signifying the start and end of the input. We’re going to call these tokens “BOF” and “EOF.” The grammar with those tokens becomes:

grammar = BOF { sexpr } EOF .
sexpr   = atom
        | '(' [ sexpr { sexpr } [ '.' sexpr ] ] ')' .
atom    = NUMBER | SYMBOL .
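
To check that the grammar covers both notations, it helps to walk a few inputs through it by hand (my own trace, not something from the manual):

(A . B)   matches   '(' sexpr '.' sexpr ')'     -- one sexpr, no repeats, a dotted tail
(A B C)   matches   '(' sexpr sexpr sexpr ')'   -- one sexpr, two repeats, no tail
A         matches   atom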

The Lexer

Our lexer assumes that the input is passed as a single buffer. This simplifies the lexer because we don’t have to worry about tokens spanning a buffer boundary. Of course, it costs more memory because we have to hold the entire input at once.

The lexer uses a single structure to communicate with the scanner.

struct LEXEME {
  int         isValid;
  const char *start;
  const char *end;
};
typedef struct LEXEME LEXEME;

When the lexer finds a lexeme, it sets isValid, points start at the first character of the lexeme, and points end at the character just past the last one. For example, given the buffer (A . B), the first call returns a valid lexeme with start pointing at the '(' and end one character past it.

The code for the lexer is:

// Lexer(buf)
//   buf   pointer to nil-terminated buffer to extract the next lexeme from.
//         ignores whitespace in the buffer.
//
//         glyphs = '(' | ')' | '.'
//         text   = alpha alnum*
//         number = '-'? digit+ ( '.' digit+ )?
//
LEXEME *Lexer(const char *buf) {
  static LEXEME l;

  // check for valid input buffer
  if (!buf) {
    return 0;
  }

  // skip leading whitespace
  while (isspace(*buf)) {
    buf++;
  }

  // check for end of buffer
  if (*buf == 0) {
    return 0;
  }

  l.start = buf;

  switch (*buf) {
  case '(':
  case ')':
  case '.':
    l.isValid = 1;
    l.end     = buf + 1;
    break;
  default:
    if (IsSymbol(l.start)) {
      l.isValid = 1;
      l.end     = LexSymbol(l.start);
    } else if (IsNumber(l.start)) {
      l.isValid = 1;
      l.end     = LexNumber(l.start);
    } else {
      // invalid word
      l.isValid = 0;
      l.end     = buf + 1;
    }
    break;
  }

  return &l;
}
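
Before wiring the lexer into the scanner, it helps to see how a caller drives it: call Lexer, look at the lexeme, then call it again starting at end. The PrintLexemes helper below is my own scaffolding, not part of the parser; it just prints every lexeme in a buffer.

static void PrintLexemes(const char *buf) {
  for (LEXEME *l = Lexer(buf); l; l = Lexer(buf)) {
    printf("%s '%.*s'\n", l->isValid ? "ok " : "bad",
           (int)(l->end - l->start), l->start);
    buf = l->end;   // resume just past the lexeme we consumed
  }
}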

The Scanner

Our scanner assumes that it is passed the entire input buffer. It builds a linked list of tokens. The parser will take that list of tokens as its input.

The scanner uses the following structure:

struct TOKEN {
  struct TOKEN *prev;
  struct TOKEN *next;
  int           kind;
  char          data[1];
};
typedef struct TOKEN TOKEN;

The variable kind holds the type of the token. The data field uses the old one-element-array trick: NewToken (shown in the support code below) over-allocates the structure so the token’s text can be stored inline, right after the other fields.

#define TOK_INVALID 0x0000
#define TOK_BOF     0x0001
#define TOK_EOF     0x0002
#define TOK_DOT     0x0003
#define TOK_LPAREN  0x0004
#define TOK_RPAREN  0x0005
#define TOK_NUMBER  0x0006
#define TOK_TEXT    0x0007
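
For debugging, it is handy to turn those numbers back into names. The KindName helper below is my own sketch, not part of the scanner or parser; I use it in a couple of the examples that follow.

static const char *KindName(int kind) {
  switch (kind) {
  case TOK_BOF:    return "BOF";
  case TOK_EOF:    return "EOF";
  case TOK_DOT:    return "DOT";
  case TOK_LPAREN: return "LPAREN";
  case TOK_RPAREN: return "RPAREN";
  case TOK_NUMBER: return "NUMBER";
  case TOK_TEXT:   return "TEXT";
  default:         return "INVALID";
  }
}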

Remember that our grammar expects the token list to start with BOF and end with EOF? The scanner adds those for us.

// Scanner(inputBuffer)
//   inputBuffer   pointer to nil-terminated text to tokenize
//
TOKEN *Scanner(const char *inputBuffer) {
  TOKEN *root = NewToken(TOK_BOF, 0, 0);
  TOKEN *tail = root;

  LEXEME *l = Lexer(inputBuffer);
  while (l) {
    if (!l->isValid) {
      // handle bad word
      return 0;
    }

    tail->next = NewToken(0, l->start, l->end);
    tail->next->prev = tail;
    tail = tail->next;

    switch (*(l->start)) {
    case '.':
      tail->kind = TOK_DOT;
      break;
    case '(':
      tail->kind = TOK_LPAREN;
      break;
    case ')':
      tail->kind = TOK_RPAREN;
      break;
    default:
      // a number may start with a digit or a leading minus sign
      if (isdigit(*(l->start)) || *(l->start) == '-') {
        tail->kind = TOK_NUMBER;
      } else {
        tail->kind = TOK_TEXT;
      }
      break;
    }

    // start up with the next character after the token,
    // then fetch the next lexeme (otherwise we would loop forever)
    inputBuffer = l->end;
    l = Lexer(inputBuffer);
  }

  tail->next = NewToken(TOK_EOF, 0, 0);
  tail->next->prev = tail;

  return root;
}
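
A quick way to check the scanner is to tokenize a string and walk the list it returns. DumpTokens is my own scaffolding (it uses the KindName sketch from earlier), not part of the parser. For (A . B) it prints BOF, LPAREN, TEXT 'A', DOT, TEXT 'B', RPAREN, and EOF.

static void DumpTokens(const char *src) {
  for (TOKEN *t = Scanner(src); t; t = t->next) {
    printf("%-8s '%s'\n", KindName(t->kind), t->data);
  }
}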

The Parser

To avoid using globals, we will pass the parser state to every function in the parser. The state is bare-bones right now. It contains only a pointer to the first token (the BOF) and the current token:

struct PARSE_STATE {
    TOKEN *root;
    TOKEN *curr;
};
typedef struct PARSE_STATE PARSE_STATE;

The parser for an atom is:

// atom    = NUMBER | SYMBOL | TEXT .
//
static int PSAtom(PARSE_STATE *ps) {
  if (ps->curr->kind == TOK_NUMBER) {
    ps->curr = ps->curr->next;
    return 1;
  } else if (ps->curr->kind == TOK_TEXT) {
    ps->curr = ps->curr->next;
    return 1;
  }
  return 0;
}

For the moment, we’re treating symbols as text. Later, when we add a symbol table, we’ll fix this logic. The function checks for the tokens that can make up an atom: a number or text. If either is found, the parser state advances to the next token and the function returns 1 to say that it found an atom. If neither is found, it returns 0.
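
A quick way to convince yourself that PSAtom behaves as described is to feed it a scanned token list and check that it both accepts the atom and advances the state. This little test is my own scaffolding, not part of the article’s parser.

static void TestAtom(void) {
  PARSE_STATE ps;
  ps.root = Scanner("A");     // "A" always scans cleanly
  ps.curr = ps.root->next;    // skip the BOF token
  if (PSAtom(&ps) && ps.curr->kind == TOK_EOF) {
    printf("atom: ok\n");
  } else {
    printf("atom: failed\n");
  }
}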

The parser for an s-expression is:

// sexpr     = atom
//           | '(' [ sexpr { sexpr } [ '.' sexpr ] ] ')'
//
static int PSExpr(PARSE_STATE *ps) {
  if (PSAtom(ps)) {
    return 1;
  } else if (ps->curr->kind == TOK_LPAREN) {
    ps->curr = ps->curr->next;

    if (PSExpr(ps)) {
      do {
        //
      } while (PSExpr(ps));

      if (ps->curr->kind == TOK_DOT) {
        ps->curr = ps->curr->next;
        if (!PSExpr(ps)) {
          return 0;
        }
      }
    }

    if (ps->curr->kind == TOK_RPAREN) {
      ps->curr = ps->curr->next;
      return 1;
    }
  }

  return 0;
}

Right now the parser only recognizes valid s-expressions; the empty do/while body above is one of the placeholders. Later, we will add code to those placeholders to build a structure for each s-expression. We will use that structure when building the interpreter.

To use this code, we need a driving function.

// LIST_ELEMENT is the structure we will eventually build for each
// s-expression; a forward declaration is enough for now because this
// driver only ever returns 0.
typedef struct LIST_ELEMENT LIST_ELEMENT;

LIST_ELEMENT *SExprParse(TOKEN *root) {
  if (!root || root->kind != TOK_BOF) {
    return 0;
  }

  PARSE_STATE ps;
  ps.root = root;
  ps.curr = root->next;

  TOKEN *start = ps.curr;
  while (PSExpr(&ps)) {
    printf("module         found statement\n");
    start = ps.curr;

    // placeholder to interpret the statement
    if (ps.curr->kind == TOK_EOF) {
      // end of the module
      ps.curr = ps.curr->next;
      break;
    }
  }

  if (ps.curr) {
    printf("error:\tinvalid symbol in module\n");
    printf("\t%-18s == %d\n", "kind", ps.curr->kind);
    printf("\t%-18s == '%s'\n", "part of it", ps.curr->data);
  }

  return 0;
}
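
Putting the pieces together, a small driver scans a string and hands the token list to SExprParse. This main function is my own sketch; the article proper stops at the parsing routines.

int main(void) {
  const char *src = "(A . B) (A B C)";
  TOKEN *tokens = Scanner(src);
  if (!tokens) {
    printf("error:\tscanner rejected the input\n");
    return 1;
  }
  SExprParse(tokens);
  return 0;
}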

Support Code

Some miscellaneous code that I didn’t want cluttering up the text above. All of the code in this article also needs a handful of standard headers:

#include <ctype.h>   // isalnum, isalpha, isdigit, isspace
#include <stdio.h>   // perror, printf
#include <stdlib.h>  // exit, malloc
#include <string.h>  // memcpy

static int IsNumber(const char *buf) {
  if (isdigit(*buf)) {
    return 1;
  } else if (*buf == '-' && isdigit(*(buf+1))) {
    return 1;
  }
  return 0;
}

static int IsSymbol(const char *buf) {
  if (isalpha(*buf)) {
    return 1;
  }
  return 0;
}

static const char *LexNumber(const char *buf) {
  if (*buf == '-') {
    buf++;
  }
  while (isdigit(*buf)) {
    buf++;
  }
  if (*buf == '.' && isdigit(*(buf+1))) {
    buf++;
    while (isdigit(*buf)) {
      buf++;
    }
  }
  return buf;
}

static const char *LexSymbol(const char *buf) {
  // accept alpha followed by alphanumerics, per "text = alpha alnum*"
  if (isalpha(*buf)) {
    buf++;
  }
  while (isalnum(*buf)) {
    buf++;
  }
  return buf;
}

static TOKEN *NewToken(int kind, const char *start, const char *end) {
  // only copy data when we were handed a sensible range; note that we
  // must not do pointer arithmetic on null pointers
  int len = 0;
  if (start && end && end > start) {
    len = end - start;
  }
  TOKEN *t = (TOKEN *)malloc(sizeof(*t) + len);
  if (!t) {
    perror("NewToken");
    exit(2);
  }
  t->prev = t->next = 0;
  t->kind = kind;
  if (len) {
    memcpy(t->data, start, len);
  }
  t->data[len] = 0;
  return t;
}
