From 01f427bc0351323fe9007ef62f185872d3311b52 Mon Sep 17 00:00:00 2001 From: Akuli Date: Fri, 17 Feb 2023 22:23:50 +0200 Subject: [PATCH 01/15] Start working on self-hosted compiler. Hitting bugs in existing compiler... --- self_hosted/errors_and_warnings.jou | 25 ++++ self_hosted/tokenizer.jou | 203 ++++++++++++++++++++++++++++ stdlib/io.jou | 8 ++ stdlib/mem.jou | 1 + 4 files changed, 237 insertions(+) create mode 100644 self_hosted/errors_and_warnings.jou create mode 100644 self_hosted/tokenizer.jou diff --git a/self_hosted/errors_and_warnings.jou b/self_hosted/errors_and_warnings.jou new file mode 100644 index 00000000..663461d9 --- /dev/null +++ b/self_hosted/errors_and_warnings.jou @@ -0,0 +1,25 @@ +from "stdlib/process.jou" import exit +from "stdlib/io.jou" import stdout, stderr, fprintf, fflush + +struct Location: + path: byte* # Not owned. Points to a string that is held elsewhere. + lineno: int + +def fail(location: Location, message: byte*) -> void: + # When stdout is redirected to same place as stderr, + # make sure that normal printf()s show up before our error. + fflush(stdout) + fflush(stderr) + + fprintf(stderr, "compiler error in file \"%s\"", location.path) + if location.lineno != 0: + fprintf(stderr, ", line %d", location.lineno) + fprintf(stderr, ": %s\n", message) + + exit(1) + +# TODO: doesn't really belong here +def assert(b: bool) -> void: + if not b: + fprintf(stderr, "assertion failed\n") + exit(1) diff --git a/self_hosted/tokenizer.jou b/self_hosted/tokenizer.jou new file mode 100644 index 00000000..548cc697 --- /dev/null +++ b/self_hosted/tokenizer.jou @@ -0,0 +1,203 @@ +from "stdlib/io.jou" import printf, FILE, fgetc, ferror, fopen +from "stdlib/mem.jou" import malloc, realloc, free +from "./errors_and_warnings.jou" import Location, fail, assert + +enum TokenKind: + Int + Long + Float + Double + Byte # example: 'a' is 97 as a byte + String + Name + Keyword + Newline + Indent + Dedent + Operator + EndOfFile # Marks the end of an array of tokens. + +struct Token: + kind: TokenKind + location: Location + + # Only one of these is used at a time. + # TODO: union + int_value: int # Int + long_value: long # Long + byte_value: byte # Byte + indentation_level: int # Newline (indicates how many spaces there are after the newline) + short_string: byte[100] # Name, Keyword, Operator + long_string: byte* # String + +declare isprint(b: int) -> int + +def print_token(token: Token*) -> void: + if token->kind == TokenKind::Int: + printf("integer %d\n", token->int_value) + elif token->kind == TokenKind::Long: + printf("long %lld\n", token->long_value) + elif token->kind == TokenKind::Float: + printf("float %s\n", &token->short_string[0]) + elif token->kind == TokenKind::Double: + printf("double %s\n", &token->short_string[0]) + elif token->kind == TokenKind::Byte: + printf("character %#02x", token->byte_value) + if isprint(token->byte_value) != 0: + printf(" '%c'", token->byte_value) + printf("\n") + elif token->kind == TokenKind::EndOfFile: + printf("end of file\n") + else: + printf("????\n") + +struct Tokenizer: + f: FILE* + location: Location + pushback: byte* + pushback_len: int # TODO: dynamic array + # Parens array isn't dynamic, so that you can't segfault + # the compiler by feeding it lots of nested parentheses, + # which would make it recurse too deep. 
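+    # Exceeding 50 levels of nesting can then be reported with a
+    # normal error message instead of a crash.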
+ parens: Token[50] + parens_len: int + +def read_byte(self: Tokenizer*) -> byte: + EOF = -1 # FIXME + + c: byte + if self->pushback_len > 0: + c = self->pushback[--self->pushback_len] + else: + temp = fgetc(self->f) + if temp == '\r': + # On Windows, \r just before \n is ignored. + temp = fgetc(self->f) + if temp != EOF and temp != '\n': + # TODO: test this, if possible? + fail(self->location, "source file contains a CR byte ('\\r') that isn't a part of a CRLF line ending") + + if temp == EOF: + if ferror(self->f) != 0: + # TODO: include errno in the error message + fail(self->location, "cannot read file") + # Use the zero byte to denote end of file. + c = '\0' + elif temp == '\0': + # TODO: test this + fail(self->location, "source file contains a zero byte") + c = 'x' # TODO: silences compiler warning, but never runs + else: + c = temp as byte + + if c == '\n': + self->location.lineno++ + return c + + +def unread_byte(self: Tokenizer*, b: byte) -> void: + if b == '\0': + return + + assert(b != '\r') + self->pushback = realloc(self->pushback, self->pushback_len + 1) + self->pushback[self->pushback_len++] = b + if b == '\n': + self->location.lineno-- + +def is_identifier_or_number_byte(b: byte) -> bool: + return ( + ('A' <= b and b <= 'Z') + or ('a' <= b and b <= 'z') + or ('0' <= b and b <= '9') + or b == '_' + ) + +def read_identifier_or_number(self: Tokenizer*, first_byte: byte) -> byte[100]: + dest: byte[100] + for i = 0; i < 100; i++: # TODO: memset + dest[i] = '\0' + destlen = 0 + + assert(is_identifier_or_number_byte(first_byte)) + dest[destlen++] = first_byte + + while True: + b = read_byte(self) + if is_identifier_or_number_byte(b): + if destlen == sizeof dest - 1: + fail(self->location, "name or number is too long") + dest[destlen++] = b + else: + unread_byte(self, b) + return dest + +def read_token(self: Tokenizer*) -> Token: + while True: + token = Token{location = self->location} + b = read_byte(self) + + if is_identifier_or_number_byte(b): + token.short_string = read_identifier_or_number(self, b) + token.kind = TokenKind::Name + else: + token.kind = TokenKind::EndOfFile + return token + +def tokenize_without_indent_dedent_tokens(file: FILE*, path: byte*) -> Token*: + tokenizer = Tokenizer{ + location = Location{path = path}, + f = file, + } + + # Add a fake newline to the beginning. It does a few things: + # * Less special-casing: blank lines in the beginning of the file can + # cause there to be a newline token anyway. + # * It is easier to detect an unexpected indentation in the beginning + # of the file, as it becomes just like any other indentation. + # * Line numbers start at 1. 
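+    #
+    # For example, if the file starts with 4 spaces of indentation, the
+    # tokenizer sees a newline token whose indentation level is 4, so
+    # unexpected indentation at the start of the file can be handled
+    # like on any other line.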
+ tokenizer.pushback = malloc(1) + tokenizer.pushback[0] = '\n' + tokenizer.pushback_len = 1 + + tokens: Token* = NULL + len = 0 + while len == 0 or tokens[len-1].kind != TokenKind::EndOfFile: + tokens = realloc(tokens, sizeof(tokens[0]) * (len+1)) + printf("Size of token = %lld\n", sizeof(tokens[0])) + printf("Just realloced to %lld\n", sizeof(tokens[0])* (len+1)) + printf("base=%p dest=%p\n", tokens, &tokens[len]) + # FIXME: can't do tokens[len++] + t = read_token(&tokenizer) + printf("Asd\n") + printf("base=%p dest=%p\n", tokens, &tokens[len]) + tokens[len] = t + printf("base=%p dest=%p\n", tokens, &tokens[len]) + len++ + + free(tokenizer.pushback) + return tokens + +def tokenize(path: byte*) -> Token*: + file = fopen(path, "rb") + if file == NULL: + # TODO: test this + # TODO: include errno in the message + fail(Location{path=path}, "cannot open file") + raw_tokens = tokenize_without_indent_dedent_tokens(file, path) + # TODO: handle indentations and such + return raw_tokens + + +def main() -> int: + tokens = tokenize("examples/hello.jou") + + t = tokens + while True: + print_token(t) + # TODO: Shouldn't need parentheses. + if (t++)->kind == TokenKind::EndOfFile: + break + + free(tokens) + return 0 diff --git a/stdlib/io.jou b/stdlib/io.jou index 5a13461f..e0bfeb96 100644 --- a/stdlib/io.jou +++ b/stdlib/io.jou @@ -60,6 +60,10 @@ declare fprintf(file: FILE *, pattern: byte*, ...) -> int declare fgetc(file: FILE*) -> int # see getchar() declare fscanf(file: FILE*, pattern: byte*, ...) -> int +# Ensure that output is actually written. It may remain buffered +# without calling this function. +declare fflush(file: FILE*) -> int + # Read a line of text from file into a string starting at the given # pointer. Reading stops at newline character, end of file, on error, # or when the resulting string (including the '\0') wouldn't fit @@ -68,5 +72,9 @@ declare fscanf(file: FILE*, pattern: byte*, ...) -> int # Return value: NULL on error, same as destination on success. declare fgets(destination: byte*, n: int, file: FILE*) -> byte* +# TODO: document +declare feof(file: FILE*) -> int +declare ferror(file: FILE*) -> int + # Move back to beginning of file. 
declare rewind(file: FILE*) -> void diff --git a/stdlib/mem.jou b/stdlib/mem.jou index 94250f58..e3095506 100644 --- a/stdlib/mem.jou +++ b/stdlib/mem.jou @@ -3,6 +3,7 @@ # Heap allocations # TODO: write a tutorial about using these and add a link declare malloc(size: long) -> void* +declare realloc(ptr: void*, size: long) -> void* declare free(ptr: void*) -> void declare memcpy(dest: void*, source: void*, count: long) -> void* From fcf5f72283df4acd5861aa6ae12020b88880cc7b Mon Sep 17 00:00:00 2001 From: Akuli Date: Fri, 24 Feb 2023 19:02:11 +0200 Subject: [PATCH 02/15] More working on the self-hosted tokenizer --- self_hosted/tokenizer.jou | 59 ++++++++++++++++++++++++++++++--------- 1 file changed, 46 insertions(+), 13 deletions(-) diff --git a/self_hosted/tokenizer.jou b/self_hosted/tokenizer.jou index 548cc697..7a6dff38 100644 --- a/self_hosted/tokenizer.jou +++ b/self_hosted/tokenizer.jou @@ -48,6 +48,10 @@ def print_token(token: Token*) -> void: printf("\n") elif token->kind == TokenKind::EndOfFile: printf("end of file\n") + elif token->kind == TokenKind::Name: + printf("name \"%s\"\n", &token->short_string[0]) + elif token->kind == TokenKind::Newline: + printf("newline, next indent %d\n", token->indentation_level) else: printf("????\n") @@ -132,15 +136,53 @@ def read_identifier_or_number(self: Tokenizer*, first_byte: byte) -> byte[100]: unread_byte(self, b) return dest +def consume_rest_of_line(self: Tokenizer*) -> void: + while True: + c = read_byte(self) + if c == '\0' or c == '\n': + break + +# Returns the indentation level for the next line +def read_newline_token(self: Tokenizer*) -> int: + level = 0 + while True: + c = read_byte(self) + if c == '\0': + # End of file. Do not validate that indentation is a + # multiple of 4 spaces. Add a trailing newline implicitly + # if needed. 
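+            # For example, if the last line of the file contains only
+            # a few spaces, they do not need to be a multiple of 4:
+            # returning 0 treats the file as if it ended with a newline
+            # right away.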
+            #
+            # TODO: test this
+            return 0
+        elif c == '\n':
+            level = 0
+        elif c == '#':
+            consume_rest_of_line(self)
+            level = 0
+        elif c == ' ':
+            level++
+        else:
+            unread_byte(self, c)
+            return level
+
 def read_token(self: Tokenizer*) -> Token:
     while True:
         token = Token{location = self->location}
         b = read_byte(self)
 
-        if is_identifier_or_number_byte(b):
-            token.short_string = read_identifier_or_number(self, b)
+        if b == ' ':
+            continue
+        if b == '\n':
+            if self->parens_len > 0:
+                continue
+            token.kind = TokenKind::Newline
+            token.indentation_level = read_newline_token(self)
+        elif is_identifier_or_number_byte(b):
             token.kind = TokenKind::Name
+            token.short_string = read_identifier_or_number(self, b)
         else:
+            printf("TODO '%c'\n", b)
+            # TODO
             token.kind = TokenKind::EndOfFile
         return token
 
@@ -164,16 +206,7 @@ def tokenize_without_indent_dedent_tokens(file: FILE*, path: byte*) -> Token*:
     len = 0
     while len == 0 or tokens[len-1].kind != TokenKind::EndOfFile:
         tokens = realloc(tokens, sizeof(tokens[0]) * (len+1))
-        printf("Size of token = %lld\n", sizeof(tokens[0]))
-        printf("Just realloced to %lld\n", sizeof(tokens[0])* (len+1))
-        printf("base=%p dest=%p\n", tokens, &tokens[len])
-        # FIXME: can't do tokens[len++]
-        t = read_token(&tokenizer)
-        printf("Asd\n")
-        printf("base=%p dest=%p\n", tokens, &tokens[len])
-        tokens[len] = t
-        printf("base=%p dest=%p\n", tokens, &tokens[len])
-        len++
+        tokens[len++] = read_token(&tokenizer)
 
     free(tokenizer.pushback)
     return tokens
@@ -190,7 +223,7 @@ def tokenize(path: byte*) -> Token*:
 
 
 def main() -> int:
-    tokens = tokenize("examples/hello.jou")
+    tokens = tokenize("../examples/hello.jou")
 
     t = tokens
     while True:

From 5d2e515db5a045fc9d4caafe2f9e881f56074a14 Mon Sep 17 00:00:00 2001
From: Akuli
Date: Fri, 24 Feb 2023 19:17:50 +0200
Subject: [PATCH 03/15] self-hosted tokenizer: strings

---
 self_hosted/tokenizer.jou | 57 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 56 insertions(+), 1 deletion(-)

diff --git a/self_hosted/tokenizer.jou b/self_hosted/tokenizer.jou
index 7a6dff38..407ea95e 100644
--- a/self_hosted/tokenizer.jou
+++ b/self_hosted/tokenizer.jou
@@ -1,4 +1,5 @@
 from "stdlib/io.jou" import printf, FILE, fgetc, ferror, fopen
+from "stdlib/str.jou" import sprintf
 from "stdlib/mem.jou" import malloc, realloc, free
 from "./errors_and_warnings.jou" import Location, fail, assert
@@ -52,6 +53,8 @@ def print_token(token: Token*) -> void:
         printf("name \"%s\"\n", &token->short_string[0])
     elif token->kind == TokenKind::Newline:
         printf("newline, next indent %d\n", token->indentation_level)
+    elif token->kind == TokenKind::String:
+        printf("string \"%s\"\n", token->long_string)
     else:
         printf("????\n")
 
@@ -165,18 +168,70 @@ def read_newline_token(self: Tokenizer*) -> int:
         unread_byte(self, c)
         return level
 
+def read_string(self: Tokenizer*) -> byte*:
+    result: byte* = NULL
+    len = 0
+
+    while True:
+        c = read_byte(self)
+        if c == '"':
+            break
+        elif c == '\n' or c == '\0':
+            if c == '\n':
+                self->location.lineno--
+            fail(self->location, "missing \" to end the string")
+        elif c == '\\':
+            # \n means newline, for example
+            after_backslash = read_byte(self)
+            if after_backslash == '\0':
+                fail(self->location, "missing \" to end the string")
+            elif after_backslash == 'n':
+                result = realloc(result, len+1)
+                result[len++] = '\n'
+            elif after_backslash == 'r':
+                result = realloc(result, len+1)
+                result[len++] = '\r'
+            elif after_backslash == '\\' or after_backslash == '"':
+                result = realloc(result, len+1)
+                result[len++] = after_backslash
+            elif after_backslash == 
'0': + fail(self->location, "strings cannot contain zero bytes (\\0), because that is the special end marker byte") + elif '0' <= after_backslash and after_backslash <= '9': + result = realloc(result, len+1) + result[len++] = after_backslash - '0' + elif after_backslash == '\n': + # \ at end of line, string continues on next line + len = len # TODO: pass statement + else: + if after_backslash < 0x80 and isprint(after_backslash) != 0: + message: byte* = malloc(100) + sprintf(message, "unknown escape: '\\%c'", after_backslash) + fail(self->location, message) + else: + fail(self->location, "unknown '\\' escape") + else: + result = realloc(result, len+1) + result[len++] = c + + result = realloc(result, len+1) + result[len] = '\0' + return result + def read_token(self: Tokenizer*) -> Token: while True: token = Token{location = self->location} b = read_byte(self) - if b == ' ': continue + if b == '\n': if self->parens_len > 0: continue token.kind = TokenKind::Newline token.indentation_level = read_newline_token(self) + elif b == '"': + token.kind = TokenKind::String + token.long_string = read_string(self) elif is_identifier_or_number_byte(b): token.kind = TokenKind::Name token.short_string = read_identifier_or_number(self, b) From ebe2a3f8eb99a4a7af6b80a49f738c88fec679c3 Mon Sep 17 00:00:00 2001 From: Akuli Date: Fri, 24 Feb 2023 19:46:48 +0200 Subject: [PATCH 04/15] hello world tokenizes :) --- self_hosted/tokenizer.jou | 96 +++++++++++++++++++++++++++++++++++++-- stdlib/mem.jou | 1 + stdlib/str.jou | 9 ++++ 3 files changed, 101 insertions(+), 5 deletions(-) diff --git a/self_hosted/tokenizer.jou b/self_hosted/tokenizer.jou index 407ea95e..66ceb6f3 100644 --- a/self_hosted/tokenizer.jou +++ b/self_hosted/tokenizer.jou @@ -1,6 +1,6 @@ from "stdlib/io.jou" import printf, FILE, fgetc, ferror, fopen -from "stdlib/str.jou" import sprintf -from "stdlib/mem.jou" import malloc, realloc, free +from "stdlib/str.jou" import sprintf, strlen, strchr, strcmp +from "stdlib/mem.jou" import malloc, realloc, free, memset from "./errors_and_warnings.jou" import Location, fail, assert enum TokenKind: @@ -49,6 +49,8 @@ def print_token(token: Token*) -> void: printf("\n") elif token->kind == TokenKind::EndOfFile: printf("end of file\n") + elif token->kind == TokenKind::Operator: + printf("operator '%s'\n", &token->short_string[0]) elif token->kind == TokenKind::Name: printf("name \"%s\"\n", &token->short_string[0]) elif token->kind == TokenKind::Newline: @@ -217,6 +219,84 @@ def read_string(self: Tokenizer*) -> byte*: result[len] = '\0' return result +def is_operator_byte(c: byte) -> bool: + return c != '\0' and strchr("=<>!.,()[]{};:+-*/&%", c) != NULL + +declare strncmp(s1: byte*, s2: byte*, n: long) -> int + +def starts_with(s: byte*, prefix: byte*) -> bool: + return strncmp(s, prefix, strlen(prefix)) == 0 + +def read_operator(self: Tokenizer*) -> byte[100]: + # TODO: nicer array syntax + operators: byte*[100] + i = 0 + # Longer operators first, so that '==' does not parse as '=' '=' + operators[i++] = "..." + operators[i++] = "===" + operators[i++] = "!==" + operators[i++] = "==" + operators[i++] = "!=" + operators[i++] = "->" + operators[i++] = "<=" + operators[i++] = ">=" + operators[i++] = "++" + operators[i++] = "--" + operators[i++] = "+=" + operators[i++] = "-=" + operators[i++] = "*=" + operators[i++] = "/=" + operators[i++] = "%=" + operators[i++] = "::" + operators[i++] = "." 
+    operators[i++] = ","
+    operators[i++] = ":"
+    operators[i++] = ";"
+    operators[i++] = "="
+    operators[i++] = "("
+    operators[i++] = ")"
+    operators[i++] = "{"
+    operators[i++] = "}"
+    operators[i++] = "["
+    operators[i++] = "]"
+    operators[i++] = "&"
+    operators[i++] = "%"
+    operators[i++] = "*"
+    operators[i++] = "/"
+    operators[i++] = "+"
+    operators[i++] = "-"
+    operators[i++] = "<"
+    operators[i++] = ">"
+    operators[i] = NULL
+
+    operator: byte[100]
+    memset(&operator, 0, sizeof operator)
+
+    # Read as many operator characters as we may need.
+    while strlen(&operator[0]) < 3:
+        c = read_byte(self)
+        if not is_operator_byte(c):
+            unread_byte(self, c)
+            break
+        operator[strlen(&operator[0])] = c
+
+    for op = &operators[0]; *op != NULL; op++:
+        if starts_with(&operator[0], *op):
+            # Unread the bytes we didn't use.
+            while strlen(&operator[0]) > strlen(*op):
+                last = &operator[strlen(&operator[0]) - 1]
+                unread_byte(self, *last)
+                *last = '\0'
+
+            # "===" and "!==" are here only to give a better error message to javascript people.
+            if strcmp(&operator[0], "===") != 0 and strcmp(&operator[0], "!==") != 0:
+                return operator
+
+    message: byte[100]
+    sprintf(&message[0], "there is no '%s' operator", &operator[0])
+    fail(self->location, &message[0])
+    return operator # TODO: never actually runs, but causes a compiler warning
+
 def read_token(self: Tokenizer*) -> Token:
     while True:
         token = Token{location = self->location}
         b = read_byte(self)
@@ -235,10 +315,16 @@ def read_token(self: Tokenizer*) -> Token:
         elif is_identifier_or_number_byte(b):
             token.kind = TokenKind::Name
             token.short_string = read_identifier_or_number(self, b)
-        else:
-            printf("TODO '%c'\n", b)
-            # TODO
+        elif is_operator_byte(b):
+            unread_byte(self, b)
+            token.kind = TokenKind::Operator
+            token.short_string = read_operator(self)
+        elif b == '\0':
             token.kind = TokenKind::EndOfFile
+        else:
+            message: byte[100]
+            sprintf(&message[0], "unexpected byte %#02x", b)
+            fail(self->location, &message[0])
         return token
 
 def tokenize_without_indent_dedent_tokens(file: FILE*, path: byte*) -> Token*:
diff --git a/stdlib/mem.jou b/stdlib/mem.jou
index e3095506..bd905113 100644
--- a/stdlib/mem.jou
+++ b/stdlib/mem.jou
@@ -6,4 +6,5 @@ declare malloc(size: long) -> void*
 declare realloc(ptr: void*, size: long) -> void*
 declare free(ptr: void*) -> void
 
+declare memset(dest: void*, fill_byte: int, count: long) -> void*
 declare memcpy(dest: void*, source: void*, count: long) -> void*
diff --git a/stdlib/str.jou b/stdlib/str.jou
index bdbe565d..0fb56698 100644
--- a/stdlib/str.jou
+++ b/stdlib/str.jou
@@ -11,3 +11,12 @@ declare snprintf(dest: byte*, n: long, pattern: byte*, ...) -> int
 
 # Find a substring. Return a pointer to the occurrence in haystack, or NULL if not found.
 declare strstr(haystack: byte*, needle: byte*) -> byte*
+
+# Similar to strstr(), but searches for a single byte rather than a substring.
+declare strchr(haystack: byte*, needle: byte) -> byte*
+
+# Calculate the length of a string in bytes. Note that strlen("ö") == 2, for example.
+declare strlen(s: byte*) -> long
+
+# Compare the strings. Return 0 for equal, or nonzero for not equal. 
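+# For example, strcmp("abc", "abc") == 0 and strcmp("abc", "abd") != 0.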
+declare strcmp(s1: byte*, s2: byte*) -> int From 0195a9bc6e1777fdeea85bffc89b75574671b390 Mon Sep 17 00:00:00 2001 From: Akuli Date: Fri, 24 Feb 2023 21:08:45 +0200 Subject: [PATCH 05/15] Simplify how tokenizing works at end of file --- src/tokenize.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/tokenize.c b/src/tokenize.c index 506aabb2..67cb2f84 100644 --- a/src/tokenize.c +++ b/src/tokenize.c @@ -126,7 +126,7 @@ static void read_indentation_as_newline_token(struct State *st, Token *t) else if (c == '\0') { // Ignore newline+spaces at end of file. Do not validate 4 spaces. // TODO: test case - t->type = TOKEN_END_OF_FILE; + t->data.indentation_level = 0; return; } else { unread_byte(st, c); @@ -480,10 +480,6 @@ static Token *handle_indentations(const Token *temp_tokens) do{ if (t->type == TOKEN_END_OF_FILE) { - // Add an extra newline token at end of file and the dedents after it. - // This makes it similar to how other newline and dedent tokens work: - // the dedents always come after a newline token. - Append(&tokens, (Token){ .location=t->location, .type=TOKEN_NEWLINE }); while(level) { Append(&tokens, (Token){ .location=t->location, .type=TOKEN_DEDENT }); level -= 4; From 06fa4099f593b03413418fd88d195af1a9026c37 Mon Sep 17 00:00:00 2001 From: Akuli Date: Fri, 24 Feb 2023 21:46:01 +0200 Subject: [PATCH 06/15] Make sure hello world tokenizes in exactly same way with both tokenizers. --- self_hosted/tokenizer.jou | 138 +++++++++++++++++++++++++++++++++++--- src/print.c | 4 +- stdlib/mem.jou | 1 + 3 files changed, 130 insertions(+), 13 deletions(-) diff --git a/self_hosted/tokenizer.jou b/self_hosted/tokenizer.jou index 66ceb6f3..72523298 100644 --- a/self_hosted/tokenizer.jou +++ b/self_hosted/tokenizer.jou @@ -1,6 +1,6 @@ from "stdlib/io.jou" import printf, FILE, fgetc, ferror, fopen from "stdlib/str.jou" import sprintf, strlen, strchr, strcmp -from "stdlib/mem.jou" import malloc, realloc, free, memset +from "stdlib/mem.jou" import malloc, realloc, free, memset, memmove from "./errors_and_warnings.jou" import Location, fail, assert enum TokenKind: @@ -53,10 +53,16 @@ def print_token(token: Token*) -> void: printf("operator '%s'\n", &token->short_string[0]) elif token->kind == TokenKind::Name: printf("name \"%s\"\n", &token->short_string[0]) + elif token->kind == TokenKind::Keyword: + printf("keyword \"%s\"\n", &token->short_string[0]) elif token->kind == TokenKind::Newline: - printf("newline, next indent %d\n", token->indentation_level) + printf("newline token (next line has %d spaces of indentation)\n", token->indentation_level) elif token->kind == TokenKind::String: printf("string \"%s\"\n", token->long_string) + elif token->kind == TokenKind::Indent: + printf("indent (+4 spaces)\n") + elif token->kind == TokenKind::Dedent: + printf("dedent (-4 spaces)\n") else: printf("????\n") @@ -297,6 +303,49 @@ def read_operator(self: Tokenizer*) -> byte[100]: fail(self->location, &message[0]) return operator # TODO: never actually runs, but causes a compiler warning +def is_keyword(word: byte*) -> bool: + # TODO: better array syntax + keywords: byte*[100] + i = 0 + keywords[i++] = "from" + keywords[i++] = "import" + keywords[i++] = "def" + keywords[i++] = "declare" + keywords[i++] = "struct" + keywords[i++] = "enum" + keywords[i++] = "global" + keywords[i++] = "return" + keywords[i++] = "if" + keywords[i++] = "elif" + keywords[i++] = "else" + keywords[i++] = "while" + keywords[i++] = "for" + keywords[i++] = "break" + keywords[i++] = "continue" + keywords[i++] = 
"True" + keywords[i++] = "False" + keywords[i++] = "NULL" + keywords[i++] = "and" + keywords[i++] = "or" + keywords[i++] = "not" + keywords[i++] = "as" + keywords[i++] = "sizeof" + keywords[i++] = "void" + keywords[i++] = "bool" + keywords[i++] = "byte" + keywords[i++] = "int" + keywords[i++] = "long" + keywords[i++] = "float" + keywords[i++] = "double" + keywords[i++] = NULL + + for kw = &keywords[0]; *kw != NULL; kw++: + if strcmp(*kw, word) == 0: + return True + return False + +declare atoi(s: byte*) -> int + def read_token(self: Tokenizer*) -> Token: while True: token = Token{location = self->location} @@ -313,8 +362,15 @@ def read_token(self: Tokenizer*) -> Token: token.kind = TokenKind::String token.long_string = read_string(self) elif is_identifier_or_number_byte(b): - token.kind = TokenKind::Name token.short_string = read_identifier_or_number(self, b) + if is_keyword(&token.short_string[0]): + token.kind = TokenKind::Keyword + elif '0' <= token.short_string[0] and token.short_string[0] <= '9': + # TODO: support various other things + token.kind = TokenKind::Int + token.int_value = atoi(&token.short_string[0]) + else: + token.kind = TokenKind::Name elif is_operator_byte(b): unread_byte(self, b) token.kind = TokenKind::Operator @@ -352,26 +408,86 @@ def tokenize_without_indent_dedent_tokens(file: FILE*, path: byte*) -> Token*: free(tokenizer.pushback) return tokens +# Creates a new array of tokens with indent/dedent tokens added after +# newline tokens that change the indentation level. +def handle_indentations(raw_tokens: Token*) -> Token*: + tokens: Token* = NULL + ntokens = 0 + level = 0 + + for t = raw_tokens; True; t++: + if t->kind == TokenKind::EndOfFile: + # Add an extra newline token at end of file and the dedents after it. + # This makes it similar to how other newline and dedent tokens work: + # the dedents always come after a newline token. + tokens = realloc(tokens, sizeof tokens[0] * (ntokens + level/4 + 1)) + while level != 0: + tokens[ntokens++] = Token{location = t->location, kind = TokenKind::Dedent} + level -= 4 + tokens[ntokens++] = *t + break + + tokens = realloc(tokens, sizeof tokens[0] * (ntokens+1)) + tokens[ntokens++] = *t + + if t->kind == TokenKind::Newline: + after_newline = t->location + after_newline.lineno++ + + if t->indentation_level % 4 != 0: + fail(after_newline, "indentation must be a multiple of 4 spaces") + + while level < t->indentation_level: + tokens = realloc(tokens, sizeof tokens[0] * (ntokens+1)) + tokens[ntokens++] = Token{location = after_newline, kind = TokenKind::Indent} + level += 4 + + while level > t->indentation_level: + tokens = realloc(tokens, sizeof tokens[0] * (ntokens+1)) + tokens[ntokens++] = Token{location = after_newline, kind = TokenKind::Dedent} + level -= 4 + + # Delete the newline token in the beginning. + # + # If the file has indentations after it, they are now represented by separate + # indent tokens and parsing will fail. If the file doesn't have any blank/comment + # lines in the beginning, it has a newline token anyway to avoid special casing. 
+ assert(tokens[0].kind == TokenKind::Newline) + memmove(&tokens[0], &tokens[1], sizeof tokens[0] * (ntokens - 1)) + + return tokens + def tokenize(path: byte*) -> Token*: file = fopen(path, "rb") if file == NULL: # TODO: test this # TODO: include errno in the message fail(Location{path=path}, "cannot open file") - raw_tokens = tokenize_without_indent_dedent_tokens(file, path) - # TODO: handle indentations and such - return raw_tokens + raw_tokens = tokenize_without_indent_dedent_tokens(file, path) + better_tokens = handle_indentations(raw_tokens) + free(raw_tokens) + return better_tokens -def main() -> int: - tokens = tokenize("../examples/hello.jou") - +def print_tokens(tokens: Token*) -> void: + printf("===== Tokens for file \"%s\" =====\n", tokens->location.path) t = tokens + current_lineno = -1 + while True: + if t->location.lineno != current_lineno: + current_lineno = t->location.lineno + printf("\nLine %d:\n", current_lineno) + + printf(" ") print_token(t) - # TODO: Shouldn't need parentheses. - if (t++)->kind == TokenKind::EndOfFile: + + if t->kind == TokenKind::EndOfFile: break + t++ +def main() -> int: + tokens = tokenize("../examples/hello.jou") + print_tokens(tokens) free(tokens) return 0 diff --git a/src/print.c b/src/print.c index 215fca53..4588c080 100644 --- a/src/print.c +++ b/src/print.c @@ -93,10 +93,10 @@ void print_token(const Token *token) printf("end of file\n"); break; case TOKEN_INDENT: - printf("more indentation (+4 spaces)\n"); + printf("indent (+4 spaces)\n"); break; case TOKEN_DEDENT: - printf("less indentation (-4 spaces)\n"); + printf("dedent (-4 spaces)\n"); break; case TOKEN_OPERATOR: printf("operator '%s'\n", token->data.operator); diff --git a/stdlib/mem.jou b/stdlib/mem.jou index bd905113..b71d01e8 100644 --- a/stdlib/mem.jou +++ b/stdlib/mem.jou @@ -8,3 +8,4 @@ declare free(ptr: void*) -> void declare memset(dest: void*, fill_byte: int, count: long) -> void* declare memcpy(dest: void*, source: void*, count: long) -> void* +declare memmove(dest: void*, source: void*, count: long) -> void* From 598cde2bf7474494da97c6f5e6f97a0d526cfe53 Mon Sep 17 00:00:00 2001 From: Akuli Date: Fri, 24 Feb 2023 22:52:56 +0200 Subject: [PATCH 07/15] Add tests for the self-hosted tokenizer --- .github/workflows/linux.yml | 36 +++-- .github/workflows/windows.yml | 14 ++ self_hosted/tokenizer.jou | 7 +- self_hosted/tokenizes_wrong.txt | 186 ++++++++++++++++++++++++++ src/jou_compiler.h | 1 + src/main.c | 13 +- tests/should_succeed/compiler_cli.jou | 1 + tokenizers.sh | 49 +++++++ 8 files changed, 290 insertions(+), 17 deletions(-) create mode 100644 self_hosted/tokenizes_wrong.txt create mode 100755 tokenizers.sh diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 9486d398..73d4b72f 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -13,17 +13,25 @@ jobs: # Testing all levels because there was a bug that only happened with -O1. 
(#224) opt-level: ['-O0', '-O1', '-O2', '-O3'] steps: - - uses: actions/checkout@v3 - - run: sudo apt install -y llvm-${{ matrix.llvm-version }}-dev clang-${{ matrix.llvm-version }} make valgrind - - run: LLVM_CONFIG=llvm-config-${{ matrix.llvm-version }} make - - run: ./runtests.sh --verbose './jou ${{ matrix.opt-level }} %s' - - run: ./runtests.sh --verbose './jou ${{ matrix.opt-level }} --verbose %s' - - run: ./runtests.sh --verbose --valgrind './jou ${{ matrix.opt-level }} %s' - # valgrind+verbose isn't meaningful: test script would ignore valgrind output - - run: make clean - - name: Check that "make clean" deleted all files not committed to Git - run: | - if [ "$(git status --porcelain --ignored)" != "" ]; then - git status --ignored - exit 1 - fi + - uses: actions/checkout@v3 + - run: sudo apt install -y llvm-${{ matrix.llvm-version }}-dev clang-${{ matrix.llvm-version }} make valgrind + - run: LLVM_CONFIG=llvm-config-${{ matrix.llvm-version }} make + - run: ./runtests.sh --verbose './jou ${{ matrix.opt-level }} %s' + - run: ./runtests.sh --verbose './jou ${{ matrix.opt-level }} --verbose %s' + - run: ./runtests.sh --verbose --valgrind './jou ${{ matrix.opt-level }} %s' + # valgrind+verbose isn't meaningful: test script would ignore valgrind output + - run: make clean + - name: Check that "make clean" deleted all files not committed to Git + run: | + if [ "$(git status --porcelain --ignored)" != "" ]; then + git status --ignored + exit 1 + fi + + tokenizers: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - run: sudo apt install -y llvm-13-dev clang-13 make valgrind + - run: LLVM_CONFIG=llvm-config-13 make + - run: ./tokenizers.sh diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index 6fcad65a..ffd9e3ce 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -131,3 +131,17 @@ jobs: shell: bash - run: cd "test dir" && ./runtests.sh --verbose shell: bash + + tokenizers: + needs: build + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: actions/download-artifact@v3 + with: + name: windows-zip + - run: unzip jou.zip + - run: mv jou/* . + shell: bash + - run: ./tokenizers.sh + shell: bash diff --git a/self_hosted/tokenizer.jou b/self_hosted/tokenizer.jou index 72523298..94cfc3a0 100644 --- a/self_hosted/tokenizer.jou +++ b/self_hosted/tokenizer.jou @@ -486,8 +486,11 @@ def print_tokens(tokens: Token*) -> void: break t++ -def main() -> int: - tokens = tokenize("../examples/hello.jou") + printf("\n") + +def main(argc: int, argv: byte**) -> int: + assert(argc == 2) + tokens = tokenize(argv[1]) print_tokens(tokens) free(tokens) return 0 diff --git a/self_hosted/tokenizes_wrong.txt b/self_hosted/tokenizes_wrong.txt new file mode 100644 index 00000000..19b9e63b --- /dev/null +++ b/self_hosted/tokenizes_wrong.txt @@ -0,0 +1,186 @@ +# This is a list of files that are not yet supported by the self-hosted compiler. 
+examples/fib.jou +examples/x11_window.jou +tests/syntax_error/import_after_def.jou +tests/syntax_error/missing_return_type.jou +tests/syntax_error/missing_arg_type.jou +tests/syntax_error/declare_global_with_value.jou +tests/syntax_error/arg_after_dotdotdot.jou +tests/syntax_error/chained_eq.jou +tests/syntax_error/bad_type.jou +tests/syntax_error/missing_import_keyword.jou +tests/syntax_error/missing_second_equal_sign.jou +tests/syntax_error/hex.jou +tests/syntax_error/double_assignment.jou +tests/syntax_error/double_with_letters_after.jou +tests/syntax_error/dot_after_e.jou +tests/syntax_error/unnecessary_zero.jou +tests/syntax_error/bad_addressof.jou +tests/syntax_error/bin.jou +tests/syntax_error/arg_default.jou +tests/syntax_error/def_missing_args.jou +tests/syntax_error/bad_expression.jou +tests/syntax_error/missing_field_names.jou +tests/syntax_error/bad_field.jou +tests/syntax_error/string_zero_byte.jou +tests/syntax_error/triple_equals.jou +tests/syntax_error/import_missing_quotes.jou +tests/syntax_error/unknown_escape_ascii.jou +tests/syntax_error/bad_struct_field_name.jou +tests/syntax_error/python_style_for.jou +tests/syntax_error/import1.jou +tests/syntax_error/and_or_chaining.jou +tests/syntax_error/bad_function_name_after_def.jou +tests/syntax_error/empty_char.jou +tests/syntax_error/import_missing_comma_with_parens.jou +tests/syntax_error/2bad.jou +tests/syntax_error/array_size.jou +tests/syntax_error/multidot_float.jou +tests/syntax_error/0b2.jou +tests/syntax_error/ee.jou +tests/syntax_error/overlong_char.jou +tests/syntax_error/dotdotdot_dotdotdot.jou +tests/syntax_error/bad_byte.jou +tests/syntax_error/first_line_indent.jou +tests/syntax_error/too_many_closing_parens.jou +tests/syntax_error/indentation_not4.jou +tests/syntax_error/import_missing_dot.jou +tests/syntax_error/unknown_escape_multibyte.jou +tests/syntax_error/infinite_c_style_for.jou +tests/syntax_error/struct_missing_type.jou +tests/syntax_error/bad_toplevel_declaration.jou +tests/syntax_error/missing_indentation.jou +tests/syntax_error/import_missing_comma.jou +tests/syntax_error/bad_argument_name.jou +tests/syntax_error/mismatched_close_brace.jou +tests/syntax_error/bad_statement.jou +tests/syntax_error/indexing.jou +tests/syntax_error/struct_init_js_syntax.jou +tests/syntax_error/missing_colon.jou +tests/syntax_error/missing_number_after_eminus.jou +tests/syntax_error/bad_struct_name.jou +tests/syntax_error/missing_number_after_e.jou +tests/syntax_error/chained_le.jou +tests/syntax_error/float.jou +tests/syntax_error/double_not.jou +tests/syntax_error/too_many_opening_parens.jou +tests/syntax_error/struct_default.jou +tests/crash/null_deref.jou +tests/wrong_type/assign_void.jou +tests/wrong_type/arg.jou +tests/wrong_type/assign_with_type.jou +tests/wrong_type/while.jou +tests/wrong_type/var_assignment.jou +tests/wrong_type/cannot_be_indexed.jou +tests/wrong_type/float_and_double.jou +tests/wrong_type/struct_member_assign.jou +tests/wrong_type/deref_non_pointer.jou +tests/wrong_type/elif.jou +tests/wrong_type/int_to_enum.jou +tests/wrong_type/arrow_operator_not_struct.jou +tests/wrong_type/enum_member_from_struct.jou +tests/wrong_type/enum_to_int.jou +tests/wrong_type/assign_to_deref_non_pointer.jou +tests/wrong_type/index.jou +tests/wrong_type/dot_operator.jou +tests/wrong_type/neg.jou +tests/wrong_type/array_to_ptr.jou +tests/wrong_type/arg_with_varargs.jou +tests/wrong_type/inplace_add_doesnt_go_back.jou +tests/wrong_type/mod.jou +tests/wrong_type/pointer_assignment.jou +tests/wrong_type/for.jou 
+tests/wrong_type/bool_main.jou +tests/wrong_type/plusplus.jou +tests/wrong_type/not.jou +tests/wrong_type/brace_init_arg.jou +tests/wrong_type/return_value.jou +tests/wrong_type/if.jou +tests/wrong_type/struct_member_init.jou +tests/wrong_type/void_main.jou +tests/wrong_type/array_vararg.jou +tests/wrong_type/pointer_eq.jou +tests/wrong_type/or.jou +tests/wrong_type/arrow_operator_not_pointer.jou +tests/should_succeed/assign.jou +tests/should_succeed/enum.jou +tests/should_succeed/string_syntax.jou +tests/should_succeed/octalnuber.jou +tests/should_succeed/sizeof.jou +tests/should_succeed/global_bug.jou +tests/should_succeed/crlf.jou +tests/should_succeed/loops.jou +tests/should_succeed/add_sub_mul_div_mod.jou +tests/should_succeed/printf.jou +tests/should_succeed/undefined_value_warning.jou +tests/should_succeed/global.jou +tests/should_succeed/pointer.jou +tests/should_succeed/mathlibtest.jou +tests/should_succeed/sscanf.jou +tests/should_succeed/plusplus_minusminus.jou +tests/should_succeed/stderr.jou +tests/should_succeed/return_void.jou +tests/should_succeed/unreachable_warning.jou +tests/should_succeed/local_import.jou +tests/should_succeed/array.jou +tests/should_succeed/compare.jou +tests/should_succeed/and_or_not.jou +tests/should_succeed/struct.jou +tests/should_succeed/argument.jou +tests/should_succeed/compiler_cli.jou +tests/should_succeed/file.jou +tests/should_succeed/expfloat.jou +tests/should_succeed/implicit_conversions.jou +tests/should_succeed/return_string.jou +tests/should_succeed/as.jou +tests/should_succeed/if_elif_else.jou +tests/should_succeed/unused_import.jou +tests/other_errors/missing_return.jou +tests/other_errors/brace_init_dupe.jou +tests/other_errors/double_plusplus.jou +tests/other_errors/array0.jou +tests/other_errors/address_of_minusminus.jou +tests/other_errors/function_wrong_n_args.jou +tests/other_errors/duplicate_enum_member.jou +tests/other_errors/continue_outside_loop.jou +tests/other_errors/varargs_def.jou +tests/other_errors/runtime_return_1.jou +tests/other_errors/dumb_assignment.jou +tests/other_errors/dynamic_array_length.jou +tests/other_errors/var_shadow.jou +tests/other_errors/address_of_array_indexing.jou +tests/other_errors/redefine_imported_func.jou +tests/other_errors/struct_already_exists.jou +tests/other_errors/imported_error.jou +tests/other_errors/break_outside_loop.jou +tests/other_errors/void_as_type.jou +tests/other_errors/dumb_assignment_with_plusequals.jou +tests/other_errors/using_void_function.jou +tests/other_errors/immediate_member_assign.jou +tests/other_errors/unexpected_return_value.jou +tests/other_errors/duplicate_arg_name.jou +tests/other_errors/missing_value_in_return.jou +tests/other_errors/duplicate_field_name.jou +tests/404/enum.jou +tests/404/function.jou +tests/404/var.jou +tests/404/type.jou +tests/404/import_wrong_func.jou +tests/404/struct_field.jou +tests/404/enum_member.jou +tests/404/import_symbol.jou +tests/404/var_addressof.jou +tests/404/file.jou +tests/404/import_symbol_multiline.jou +tests/already_exists_error/global_var.jou +tests/already_exists_error/struct_import.jou +tests/already_exists_error/global_var_import.jou +tests/already_exists_error/struct.jou +tests/already_exists_error/func.jou +tests/already_exists_error/local_var.jou +tests/already_exists_error/func_import.jou +tests/already_exists_error/struct_and_enum.jou +tests/too_long/long.jou +tests/too_long/nested_parentheses.jou +tests/too_long/name.jou +tests/too_long/int.jou diff --git a/src/jou_compiler.h b/src/jou_compiler.h index 
69f99104..a7b93f5a 100644 --- a/src/jou_compiler.h +++ b/src/jou_compiler.h @@ -49,6 +49,7 @@ typedef struct CfInstruction CfInstruction; struct CommandLineFlags { bool verbose; // Whether to print a LOT of debug info + bool tokenize_only; int optlevel; // Optimization level (0 don't optimize, 3 optimize a lot) const char *outfile; // If not NULL, where to output executable const char *linker_flags; // String that is appended to linking command diff --git a/src/main.c b/src/main.c index 1950d5f6..fb459fa7 100644 --- a/src/main.c +++ b/src/main.c @@ -42,6 +42,7 @@ static const char help_fmt[] = " -o OUTFILE output an executable file, don't run the code\n" " -O0/-O1/-O2/-O3 set optimization level (0 = default, 3 = runs fastest)\n" " --verbose display a lot of information about all compilation steps\n" + " --tokenize-only display only the output of the tokenizer, and don't run other compile steps\n" " --linker-flags appended to the linker command, so you can use external libraries\n" ; @@ -76,6 +77,13 @@ static void parse_arguments(int argc, char **argv, CommandLineFlags *flags, cons } else if (!strcmp(argv[i], "--verbose")) { flags->verbose = true; i++; + } else if (!strcmp(argv[i], "--tokenize-only")) { + if (argc > 3) { + fprintf(stderr, "%s: --tokenize-only cannot be used together with other flags", argv[0]); + goto wrong_usage; + } + flags->tokenize_only = true; + i++; } else if (!strcmp(argv[i], "--linker-flags")) { if (flags->linker_flags) { fprintf(stderr, "%s: --linker-flags cannot be given multiple times", argv[0]); @@ -173,8 +181,11 @@ static void parse_file(struct CompileState *compst, const char *filename, const } Token *tokens = tokenize(f, fs.path); fclose(f); - if(compst->flags.verbose) + + if(compst->flags.verbose || compst->flags.tokenize_only) print_tokens(tokens); + if (compst->flags.tokenize_only) + exit(0); fs.ast = parse(tokens, compst->stdlib_path); free_tokens(tokens); diff --git a/tests/should_succeed/compiler_cli.jou b/tests/should_succeed/compiler_cli.jou index 1dfe419b..1a0b97c5 100644 --- a/tests/should_succeed/compiler_cli.jou +++ b/tests/should_succeed/compiler_cli.jou @@ -30,6 +30,7 @@ def main() -> int: run_jou("lolwat.jou") # Output: compiler error in file "lolwat.jou": cannot open file: No such file or directory run_jou("--linker-flags") # Output: : there must be a string of flags after --linker-flags (try " --help") run_jou("--linker-flags x --linker-flags y") # Output: : --linker-flags cannot be given multiple times (try " --help") + run_jou("--tokenize-only -O1 examples/hello.jou") # Output: : --tokenize-only cannot be used together with other flags (try " --help") # Output: Usage: # Output: [-o OUTFILE] [-O0|-O1|-O2|-O3] [--verbose] [--linker-flags "..."] FILENAME diff --git a/tokenizers.sh b/tokenizers.sh new file mode 100755 index 00000000..46fdbe02 --- /dev/null +++ b/tokenizers.sh @@ -0,0 +1,49 @@ +#!/bin/bash +# +# There are two Jou compilers: one written in C and another written in Jou. +# They should be able to tokenize each Jou file in exactly the same way. +# If tokenizing a Jou file fails, both tokenizers should fail with the same error message. + +if [[ "$OS" =~ Windows ]]; then + dotexe=.exe +else + dotexe= +fi + +set -e + +rm -rf tmp/tokenizers +mkdir -v tmp/tokenizers + +echo "Compiling the self-hosted compiler..." 
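+# Use the compiler written in C to build the tokenizer written in Jou.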
+./jou${dotexe} -O1 -o tmp/tokenizers/self_hosted${dotexe} self_hosted/tokenizer.jou
+
+for file in $(find examples tests -name '*.jou'); do
+    echo $file
+    (./jou${dotexe} --tokenize-only $file || true) &> tmp/tokenizers/compiler_written_in_c.txt
+    (tmp/tokenizers/self_hosted${dotexe} $file || true) &> tmp/tokenizers/self_hosted.txt
+
+    if grep -qxF $file self_hosted/tokenizes_wrong.txt; then
+        # The file is skipped, so the two compilers should behave differently
+        if diff tmp/tokenizers/compiler_written_in_c.txt tmp/tokenizers/self_hosted.txt >/dev/null; then
+            echo "  Error: Tokenizers behave the same even though the file is listed in self_hosted/tokenizes_wrong.txt."
+            echo "  To fix this error, delete the \"$file\" line from self_hosted/tokenizes_wrong.txt."
+            exit 1
+        else
+            echo "  Tokenizers behave differently as expected (listed in self_hosted_skip.txt)"
+        fi
+    else
+        if diff -u --color=always tmp/tokenizers/compiler_written_in_c.txt tmp/tokenizers/self_hosted.txt; then
+            echo "  Tokenizers behave the same as expected"
+        else
+            echo "  Error: Tokenizers behave differently when given \"$file\"."
+            echo "  You can silence this error by adding \"$file\" to self_hosted/tokenizes_wrong.txt."
+            echo "  Ideally the tokenizers would behave in the same way for all files, but we aren't there yet."
+            exit 1
+        fi
+    fi
+done
+
+echo ""
+echo ""
+echo "success :)"

From 6bd21dba704d97c6eb9aa113bac7515006bc74f6 Mon Sep 17 00:00:00 2001
From: Akuli
Date: Fri, 24 Feb 2023 23:03:18 +0200
Subject: [PATCH 08/15] fix test

---
 src/main.c                            | 2 +-
 tests/should_succeed/compiler_cli.jou | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/main.c b/src/main.c
index fb459fa7..8241fc0a 100644
--- a/src/main.c
+++ b/src/main.c
@@ -42,7 +42,7 @@ static const char help_fmt[] =
     "  -o OUTFILE       output an executable file, don't run the code\n"
     "  -O0/-O1/-O2/-O3  set optimization level (0 = default, 3 = runs fastest)\n"
     "  --verbose        display a lot of information about all compilation steps\n"
-    "  --tokenize-only  display only the output of the tokenizer, and don't run other compile steps\n"
+    "  --tokenize-only  display only the output of the tokenizer, don't do anything else\n"
    "  --linker-flags   appended to the linker command, so you can use external libraries\n"
     ;

diff --git a/tests/should_succeed/compiler_cli.jou b/tests/should_succeed/compiler_cli.jou
index 1a0b97c5..328908af 100644
--- a/tests/should_succeed/compiler_cli.jou
+++ b/tests/should_succeed/compiler_cli.jou
@@ -41,6 +41,7 @@ def main() -> int:
     # Output:   -o OUTFILE       output an executable file, don't run the code
     # Output:   -O0/-O1/-O2/-O3  set optimization level (0 = default, 3 = runs fastest)
     # Output:   --verbose        display a lot of information about all compilation steps
+    # Output:   --tokenize-only  display only the output of the tokenizer, don't do anything else
     # Output:   --linker-flags   appended to the linker command, so you can use external libraries
     run_jou("--help")

From 70f0c4a362fac5d9bc6aed3c005e77caa5f26c92 Mon Sep 17 00:00:00 2001
From: Akuli
Date: Fri, 24 Feb 2023 23:03:42 +0200
Subject: [PATCH 09/15] fix script

---
 tokenizers.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tokenizers.sh b/tokenizers.sh
index 46fdbe02..7b68999e 100755
--- a/tokenizers.sh
+++ b/tokenizers.sh
@@ -13,7 +13,7 @@ fi
 set -e
 
 rm -rf tmp/tokenizers
-mkdir -v tmp/tokenizers
+mkdir -vp tmp/tokenizers
 
 echo "Compiling the self-hosted compiler..." 
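 # Use the compiler written in C to build the tokenizer written in Jou.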
./jou${dotexe} -O1 -o tmp/tokenizers/self_hosted${dotexe} self_hosted/tokenizer.jou From 4998a7506e1c1732b5a0b52aa26acd0166cf8c61 Mon Sep 17 00:00:00 2001 From: Akuli Date: Fri, 24 Feb 2023 23:05:25 +0200 Subject: [PATCH 10/15] fix echo --- tokenizers.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tokenizers.sh b/tokenizers.sh index 7b68999e..86b14f5d 100755 --- a/tokenizers.sh +++ b/tokenizers.sh @@ -30,7 +30,7 @@ for file in $(find examples tests -name '*.jou'); do echo " To fix this error, delete the \"$file\" line from self_hosted/tokenizes_wrong.txt." exit 1 else - echo " Tokenizers behave differently as expected (listed in self_hosted_skip.txt)" + echo " Tokenizers behave differently as expected (listed in self_hosted/tokenizes_wrong.txt)" fi else if diff -u --color=always tmp/tokenizers/compiler_written_in_c.txt tmp/tokenizers/self_hosted.txt; then From 34249b5e3a0589188adaf1cb439c7fd2d6961eef Mon Sep 17 00:00:00 2001 From: Akuli Date: Fri, 24 Feb 2023 23:12:21 +0200 Subject: [PATCH 11/15] Lets try this --- .github/workflows/windows.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index ffd9e3ce..6de12afa 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -141,7 +141,7 @@ jobs: with: name: windows-zip - run: unzip jou.zip - - run: mv jou/* . + - run: mv tokenizers.sh jou shell: bash - - run: ./tokenizers.sh + - run: (cd jou && ./tokenizers.sh) shell: bash From eebdaa6ba3efe3c6ca26d9733e21e060f43868fb Mon Sep 17 00:00:00 2001 From: Akuli Date: Fri, 24 Feb 2023 23:14:50 +0200 Subject: [PATCH 12/15] lol --- .github/workflows/windows.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index 6de12afa..401e6505 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -134,7 +134,7 @@ jobs: tokenizers: needs: build - runs-on: ubuntu-latest + runs-on: windows-latest steps: - uses: actions/checkout@v3 - uses: actions/download-artifact@v3 From 2e301ef83ff35020ac432c853c9a0d423b9cff14 Mon Sep 17 00:00:00 2001 From: Akuli Date: Fri, 24 Feb 2023 23:32:36 +0200 Subject: [PATCH 13/15] Apply suggestions from code review --- self_hosted/tokenizer.jou | 6 ++++-- self_hosted/tokenizes_wrong.txt | 2 +- src/jou_compiler.h | 2 +- stdlib/io.jou | 2 +- stdlib/mem.jou | 1 + 5 files changed, 8 insertions(+), 5 deletions(-) diff --git a/self_hosted/tokenizer.jou b/self_hosted/tokenizer.jou index 94cfc3a0..bfca58ad 100644 --- a/self_hosted/tokenizer.jou +++ b/self_hosted/tokenizer.jou @@ -31,6 +31,7 @@ struct Token: short_string: byte[100] # Name, Keyword, Operator long_string: byte* # String +# TODO: import this (#227 maybe?) 
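+# (isprint() comes from ctype.h in the C standard library.)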
declare isprint(b: int) -> int def print_token(token: Token*) -> void: @@ -130,8 +131,7 @@ def is_identifier_or_number_byte(b: byte) -> bool: def read_identifier_or_number(self: Tokenizer*, first_byte: byte) -> byte[100]: dest: byte[100] - for i = 0; i < 100; i++: # TODO: memset - dest[i] = '\0' + memset(&dest, 0, sizeof dest) destlen = 0 assert(is_identifier_or_number_byte(first_byte)) @@ -230,6 +230,7 @@ def is_operator_byte(c: byte) -> bool: declare strncmp(s1: byte*, s2: byte*, n: long) -> int +# TODO: move to stdlib def starts_with(s: byte*, prefix: byte*) -> bool: return strncmp(s, prefix, strlen(prefix)) == 0 @@ -344,6 +345,7 @@ def is_keyword(word: byte*) -> bool: return True return False +# TODO: move to stdlib declare atoi(s: byte*) -> int def read_token(self: Tokenizer*) -> Token: diff --git a/self_hosted/tokenizes_wrong.txt b/self_hosted/tokenizes_wrong.txt index 19b9e63b..b54eac84 100644 --- a/self_hosted/tokenizes_wrong.txt +++ b/self_hosted/tokenizes_wrong.txt @@ -1,4 +1,4 @@ -# This is a list of files that are not yet supported by the self-hosted compiler. +# This is a list of files that are not yet supported by the tokenizer of the self-hosted compiler. examples/fib.jou examples/x11_window.jou tests/syntax_error/import_after_def.jou diff --git a/src/jou_compiler.h b/src/jou_compiler.h index a7b93f5a..bbdadb15 100644 --- a/src/jou_compiler.h +++ b/src/jou_compiler.h @@ -49,7 +49,7 @@ typedef struct CfInstruction CfInstruction; struct CommandLineFlags { bool verbose; // Whether to print a LOT of debug info - bool tokenize_only; + bool tokenize_only; // If true, tokenize the file passed on command line and don't actually compile anything int optlevel; // Optimization level (0 don't optimize, 3 optimize a lot) const char *outfile; // If not NULL, where to output executable const char *linker_flags; // String that is appended to linking command diff --git a/stdlib/io.jou b/stdlib/io.jou index e0bfeb96..842e972b 100644 --- a/stdlib/io.jou +++ b/stdlib/io.jou @@ -61,7 +61,7 @@ declare fgetc(file: FILE*) -> int # see getchar() declare fscanf(file: FILE*, pattern: byte*, ...) -> int # Ensure that output is actually written. It may remain buffered -# without calling this function. +# if this function isn't called. 
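+# For example, calling fflush(stdout) before printing an error to stderr
+# makes sure that earlier printf() output shows up first.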
declare fflush(file: FILE*) -> int # Read a line of text from file into a string starting at the given diff --git a/stdlib/mem.jou b/stdlib/mem.jou index b71d01e8..bf303cfc 100644 --- a/stdlib/mem.jou +++ b/stdlib/mem.jou @@ -6,6 +6,7 @@ declare malloc(size: long) -> void* declare realloc(ptr: void*, size: long) -> void* declare free(ptr: void*) -> void +# TODO: explain what each of these does declare memset(dest: void*, fill_byte: int, count: long) -> void* declare memcpy(dest: void*, source: void*, count: long) -> void* declare memmove(dest: void*, source: void*, count: long) -> void* From 334403d3994fb93c870849d86156e53a125dd900 Mon Sep 17 00:00:00 2001 From: Akuli Date: Fri, 24 Feb 2023 23:32:40 +0200 Subject: [PATCH 14/15] Fixening --- .github/workflows/windows.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index 401e6505..c4a5966d 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -141,7 +141,7 @@ jobs: with: name: windows-zip - run: unzip jou.zip - - run: mv tokenizers.sh jou + - run: mv tokenizers.sh self_hosted jou shell: bash - run: (cd jou && ./tokenizers.sh) shell: bash From e233ca067d025677d011acb298ea7ac30faa0500 Mon Sep 17 00:00:00 2001 From: Akuli Date: Fri, 24 Feb 2023 23:46:15 +0200 Subject: [PATCH 15/15] cleanup/fix --- src/main.c | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/src/main.c b/src/main.c index 8241fc0a..d5e385da 100644 --- a/src/main.c +++ b/src/main.c @@ -165,6 +165,18 @@ static struct FileState *find_file(const struct CompileState *compst, const char return NULL; } +static FILE *open_the_file(const char *path, const Location *import_location) +{ + FILE *f = fopen(path, "rb"); + if (!f) { + if (import_location) + fail_with_error(*import_location, "cannot import from \"%s\": %s", path, strerror(errno)); + else + fail_with_error((Location){.filename=path}, "cannot open file: %s", strerror(errno)); + } + return f; +} + static void parse_file(struct CompileState *compst, const char *filename, const Location *import_location) { if (find_file(compst, filename)) @@ -172,20 +184,12 @@ static void parse_file(struct CompileState *compst, const char *filename, const struct FileState fs = { .path = strdup(filename) }; - FILE *f = fopen(fs.path, "rb"); - if (!f) { - if (import_location) - fail_with_error(*import_location, "cannot import from \"%s\": %s", filename, strerror(errno)); - else - fail_with_error((Location){.filename=filename}, "cannot open file: %s", strerror(errno)); - } + FILE *f = open_the_file(fs.path, import_location); Token *tokens = tokenize(f, fs.path); fclose(f); - if(compst->flags.verbose || compst->flags.tokenize_only) + if(compst->flags.verbose) print_tokens(tokens); - if (compst->flags.tokenize_only) - exit(0); fs.ast = parse(tokens, compst->stdlib_path); free_tokens(tokens); @@ -405,6 +409,15 @@ int main(int argc, char **argv) printf("Data layout: %s\n", get_target()->data_layout); } + if (compst.flags.tokenize_only) { + FILE *f = open_the_file(filename, NULL); + Token *tokens = tokenize(f, filename); + fclose(f); + print_tokens(tokens); + free_tokens(tokens); + return 0; + } + #ifdef _WIN32 char *startup_path = malloc(strlen(compst.stdlib_path) + 50); sprintf(startup_path, "%s/_windows_startup.jou", compst.stdlib_path);