Basic lexer in C

selavy · Dec 15, 2016 · 4d1e224 · 4d1e224
1 parent 1c008f1
commit 4d1e224
Show file tree

Hide file tree

Showing 4 changed files with 232 additions and 1 deletion.
diff --git a/Makefile b/Makefile
@@ -2,7 +2,7 @@ CC=gcc
 RELEASE=-O3 -fstrict-aliasing -flto
 DEBUG=-g -O0
 CFLAGS=-Wall -Werror -pedantic $(DEBUG) -std=c11
-OBJS = main.o
+OBJS = lexer.o main.o
 TARGET=interpreter
 
 $(TARGET): $(OBJS)

diff --git a/lexer.c b/lexer.c
@@ -0,0 +1,124 @@
+#include "lexer.h"
+#include <ctype.h>
+#include <stdio.h>
+#include <string.h>
+
+/*
+ * Return the spelling of a token type as a static string.
+ *
+ * The lookup table is generated from FOREACH_TOKEN, so each enumerator
+ * maps to its own identifier text (for example Plus -> "Plus").  Any value
+ * that is not one of the listed enumerators yields "unknown".
+ */
+const char *token_to_string(enum TokenType token) {
+    static const char *const names[] = {
+#define F(T) [T] = #T,
+        FOREACH_TOKEN(F)
+#undef F
+    };
+    const size_t count = sizeof names / sizeof names[0];
+    const size_t index = (size_t)token;
+
+    if (index < count && names[index] != NULL) {
+        return names[index];
+    }
+    return "unknown";
+}
+
+/*
+ * Initialise lex to scan the half-open character range [beg, end).
+ *
+ * Only the two pointers are stored; the text itself is not copied.
+ * Always returns 0.
+ */
+int lexer_create(struct Lexer *lex, const char *beg, const char *end) {
+    *lex = (struct Lexer){ .cur = beg, .end = end };
+    return 0;
+}
+
+/*
+ * Tear down a lexer.  There is nothing to release here, so this does no
+ * work and always returns 0.
+ */
+int lexer_destroy(struct Lexer *lex) {
+    (void)lex; /* intentionally unused */
+    return 0;
+}
+
+/*
+ * Report whether c may appear in an identifier: an alphabetic character
+ * (as classified by isalpha()) or an underscore.
+ *
+ * Returns 1 for identifier characters and 0 otherwise.
+ */
+int is_letter(char c) {
+    /* <ctype.h> classifiers require a value representable as unsigned char
+     * (or EOF); passing a negative plain char is undefined behavior. */
+    return c == '_' || isalpha((unsigned char)c);
+}
+
+/*
+ * Classify the identifier text in the half-open range [beg, end).
+ *
+ * Returns the keyword's token type when the text is exactly one of the
+ * reserved words (fn, let, true, false, if, else, return), Ident for any
+ * other non-empty text, and Illegal when the range is empty or inverted.
+ */
+enum TokenType lookup_ident(const char *beg, const char *end) {
+    // REVISIT(plesslie): would ideally use a perfect hash generator like gperf here
+    static const struct {
+        const char *text;
+        enum TokenType type;
+    } keywords[] = {
+        { "fn",     Function },
+        { "let",    Let },
+        { "true",   True },
+        { "false",  False },
+        { "if",     If },
+        { "else",   Else },
+        { "return", Return },
+    };
+    const size_t count = sizeof keywords / sizeof keywords[0];
+    size_t len;
+    size_t i;
+
+    if (beg >= end) {
+        return Illegal;
+    }
+    len = (size_t)(end - beg);
+
+    for (i = 0; i < count; ++i) {
+        // Require equal lengths before comparing bytes: strncmp(beg, kw, len)
+        // on its own also succeeds when the text is merely a prefix of the
+        // keyword, so "l" or "le" would be classified as Let.
+        if (strlen(keywords[i].text) == len &&
+            memcmp(beg, keywords[i].text, len) == 0) {
+            return keywords[i].type;
+        }
+    }
+    return Ident;
+}
+
+/*
+ * Scan the next token from lex and store it in *token.
+ *
+ * Leading whitespace is skipped.  token->type is always written, and
+ * token->beg / token->end are always set to the half-open range of the
+ * token's text inside the input (an empty range once the input is
+ * exhausted).  Identifiers consist of letters and underscores only;
+ * integers consist of decimal digits only.
+ *
+ * Returns 0 on success, including when EndOfFile is produced.  Returns 1
+ * when the next character does not start any known token; that character
+ * is consumed and reported as an Illegal token.
+ */
+int lexer_next_token(struct Lexer *lex, struct Token *token) {
+    char c;
+
+    // skip whitespace
+    // (<ctype.h> classifiers need an unsigned char value; a negative plain
+    // char is undefined behavior, hence the casts in this function)
+    while (lex->cur < lex->end && isspace((unsigned char)*lex->cur)) {
+        ++lex->cur;
+    }
+
+    // Start every token with a valid, empty range so callers never read
+    // uninitialized pointers, whatever the token type turns out to be.
+    token->beg = lex->cur;
+    token->end = lex->cur;
+
+    if (lex->cur >= lex->end) {
+        token->type = EndOfFile;
+        return 0;
+    }
+
+    c = *lex->cur++;
+    if (c == '=') {
+        // '=' or '=='
+        if (lex->cur < lex->end && *lex->cur == '=') {
+            ++lex->cur;
+            token->type = Equal;
+        } else {
+            token->type = Assign;
+        }
+    } else if (c == '!') {
+        // '!' or '!='
+        if (lex->cur < lex->end && *lex->cur == '=') {
+            ++lex->cur;
+            token->type = NotEqual;
+        } else {
+            token->type = Bang;
+        }
+    } else if (isdigit((unsigned char)c)) { // should this be checking for '.' as well?
+        token->type = Integer;
+        while (lex->cur < lex->end && isdigit((unsigned char)*lex->cur)) {
+            ++lex->cur;
+        }
+    } else if (is_letter(c)) {
+        while (lex->cur < lex->end && is_letter(*lex->cur)) {
+            ++lex->cur;
+        }
+        token->type = lookup_ident(token->beg, lex->cur);
+    } else {
+        switch (c) {
+            case '+': token->type = Plus; break;
+            case '-': token->type = Minus; break;
+            case '/': token->type = Slash; break;
+            case '*': token->type = Asterisk; break;
+            case '<': token->type = LowerThan; break;
+            case '>': token->type = GreaterThan; break;
+            case ';': token->type = Semicolon; break;
+            case ',': token->type = Comma; break;
+            case '{': token->type = LeftBrace; break;
+            case '}': token->type = RightBrace; break;
+            case '(': token->type = LeftParenthesis; break;
+            case ')': token->type = RightParenthesis; break;
+            default:
+                // Unknown character: report it explicitly instead of
+                // leaving a stale type from a previous call in *token.
+                token->type = Illegal;
+                token->end = lex->cur;
+                return 1;
+        }
+    }
+
+    token->end = lex->cur;
+    return 0;
+}
diff --git a/lexer.h b/lexer.h
@@ -0,0 +1,56 @@
+#ifndef LEXER__H_
+#define LEXER__H_
+
+
+/*
+ * X-macro listing every token type exactly once.  It expands to F(Name)
+ * for each entry, in the order written below; enum TokenType in this
+ * header is generated from it, so adding a token means adding one line
+ * here.
+ *
+ * The last entry also ends in a line-continuation backslash, so the blank
+ * line that follows the list is part of the macro and must be kept.
+ */
+#define FOREACH_TOKEN(F) \
+    F(Illegal) \
+    F(EndOfFile) \
+    F(Ident) \
+    F(Integer) \
+    F(Assign) \
+    F(Plus) \
+    F(Minus) \
+    F(Bang) \
+    F(Asterisk) \
+    F(Slash) \
+    F(LowerThan) \
+    F(GreaterThan) \
+    F(Equal) \
+    F(NotEqual) \
+    F(Comma) \
+    F(Semicolon) \
+    F(LeftParenthesis) \
+    F(RightParenthesis) \
+    F(LeftBrace) \
+    F(RightBrace) \
+    F(Function) \
+    F(Let) \
+    F(True) \
+    F(False) \
+    F(If) \
+    F(Else) \
+    F(Return) \
+
+/*
+ * All token types, generated from FOREACH_TOKEN.  Enumerators take their
+ * values from their position in that list, so Illegal is 0.
+ */
+enum TokenType {
+    #define F(token) token,
+    FOREACH_TOKEN(F)
+    #undef F
+};
+
+/*
+ * Return the name of a token type as a string: the enumerator's own
+ * spelling (e.g. "Plus"), or "unknown" for a value that is not a listed
+ * token type.  The returned string must not be modified or freed.
+ */
+extern const char *token_to_string(enum TokenType token);
+
+/*
+ * One token as reported by lexer_next_token().
+ */
+struct Token {
+    /* Kind of token. */
+    enum TokenType type;
+    /*
+     * Half-open range [beg, end) of the token's text.  The pointers refer
+     * into the text being lexed (no copy is made), so the text is not
+     * NUL-terminated at end.
+     * NOTE(review): check lexer_next_token() for which token types fill
+     * these in before relying on them for every token.
+     */
+    const char *beg;
+    const char *end;
+};
+
+/*
+ * Lexer state: a cursor over a caller-supplied range of characters.
+ */
+struct Lexer {
+    /* Next character to be read; advanced as tokens are consumed. */
+    const char *cur;
+    /* One past the last character of the input. */
+    const char *end;
+};
+/*
+ * Point lex at the input range [beg, end).  The text is not copied, so it
+ * must stay valid for as long as the lexer is used.  Returns 0 on success.
+ */
+int lexer_create(struct Lexer *lex, const char *beg, const char *end);
+/*
+ * Release a lexer set up with lexer_create().  Returns 0 on success.
+ */
+int lexer_destroy(struct Lexer *lex);
+/*
+ * Skip leading whitespace and store the next token in *token.
+ * Returns 0 on success; token->type is EndOfFile once the input is
+ * exhausted.  Returns non-zero when the next character does not begin a
+ * recognized token.
+ */
+int lexer_next_token(struct Lexer *lex, struct Token *token);
+
+#endif // LEXER__H_
diff --git a/main.c b/main.c
@@ -1,5 +1,56 @@
+#define _GNU_SOURCE
 #include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include "lexer.h"
 
 int main(int argc, char **argv) {
+    FILE *stream;
+    char *line = NULL;
+    size_t len = 0;
+    ssize_t read;
+    struct Lexer lexer;
+    struct Token token;
+
+    stream = fdopen(STDIN_FILENO, "rb");
+    if (!stream) {
+        exit(EXIT_FAILURE);
+    }
+
+    setbuf(stdout, 0);
+    while(1) {
+        printf(">> ");
+        read = getline(&line, &len, stream);
+        if (read > 0) {
+            if (lexer_create(&lexer, line, line+read) != 0) {
+                fprintf(stderr, "Failed to create lexer!\n");
+                exit(EXIT_FAILURE);
+            }
+
+            // printf("Retrieved line of length %zu :\n", read);
+            // printf("%s", line);
+
+            do {
+                if (lexer_next_token(&lexer, &token) != 0) {
+                    fprintf(stderr, "Lexer failed to get next Token!\n");
+                    break;
+                }
+                printf("%s\n", token_to_string(token.type));
+            } while (token.type != EndOfFile);
+
+            if (lexer_destroy(&lexer) != 0) {
+                fprintf(stderr, "Failed to destroy lexer!\n");
+                exit(EXIT_FAILURE);
+            }
+        } else {
+            break;
+        }
+    }
+
+    free(line);
+    fclose(stream);
+    exit(EXIT_SUCCESS);
+
     return 0;
 }