From 01f427bc0351323fe9007ef62f185872d3311b52 Mon Sep 17 00:00:00 2001 From: Akuli Date: Fri, 17 Feb 2023 22:23:50 +0200 Subject: [PATCH 01/15] Start working on self-hosted compiler. Hitting bugs in existing compiler... --- self_hosted/errors_and_warnings.jou | 25 ++++ self_hosted/tokenizer.jou | 203 ++++++++++++++++++++++++++++ stdlib/io.jou | 8 ++ stdlib/mem.jou | 1 + 4 files changed, 237 insertions(+) create mode 100644 self_hosted/errors_and_warnings.jou create mode 100644 self_hosted/tokenizer.jou diff --git a/self_hosted/errors_and_warnings.jou b/self_hosted/errors_and_warnings.jou new file mode 100644 index 00000000..663461d9 --- /dev/null +++ b/self_hosted/errors_and_warnings.jou @@ -0,0 +1,25 @@ +from "stdlib/process.jou" import exit +from "stdlib/io.jou" import stdout, stderr, fprintf, fflush + +struct Location: + path: byte* # Not owned. Points to a string that is held elsewhere. + lineno: int + +def fail(location: Location, message: byte*) -> void: + # When stdout is redirected to same place as stderr, + # make sure that normal printf()s show up before our error. + fflush(stdout) + fflush(stderr) + + fprintf(stderr, "compiler error in file \"%s\"", location.path) + if location.lineno != 0: + fprintf(stderr, ", line %d", location.lineno) + fprintf(stderr, ": %s\n", message) + + exit(1) + +# TODO: doesn't really belong here +def assert(b: bool) -> void: + if not b: + fprintf(stderr, "assertion failed\n") + exit(1) diff --git a/self_hosted/tokenizer.jou b/self_hosted/tokenizer.jou new file mode 100644 index 00000000..548cc697 --- /dev/null +++ b/self_hosted/tokenizer.jou @@ -0,0 +1,203 @@ +from "stdlib/io.jou" import printf, FILE, fgetc, ferror, fopen +from "stdlib/mem.jou" import malloc, realloc, free +from "./errors_and_warnings.jou" import Location, fail, assert + +enum TokenKind: + Int + Long + Float + Double + Byte # example: 'a' is 97 as a byte + String + Name + Keyword + Newline + Indent + Dedent + Operator + EndOfFile # Marks the end of an array of tokens. + +struct Token: + kind: TokenKind + location: Location + + # Only one of these is used at a time. + # TODO: union + int_value: int # Int + long_value: long # Long + byte_value: byte # Byte + indentation_level: int # Newline (indicates how many spaces there are after the newline) + short_string: byte[100] # Name, Keyword, Operator + long_string: byte* # String + +declare isprint(b: int) -> int + +def print_token(token: Token*) -> void: + if token->kind == TokenKind::Int: + printf("integer %d\n", token->int_value) + elif token->kind == TokenKind::Long: + printf("long %lld\n", token->long_value) + elif token->kind == TokenKind::Float: + printf("float %s\n", &token->short_string[0]) + elif token->kind == TokenKind::Double: + printf("double %s\n", &token->short_string[0]) + elif token->kind == TokenKind::Byte: + printf("character %#02x", token->byte_value) + if isprint(token->byte_value) != 0: + printf(" '%c'", token->byte_value) + printf("\n") + elif token->kind == TokenKind::EndOfFile: + printf("end of file\n") + else: + printf("????\n") + +struct Tokenizer: + f: FILE* + location: Location + pushback: byte* + pushback_len: int # TODO: dynamic array + # Parens array isn't dynamic, so that you can't segfault + # the compiler by feeding it lots of nested parentheses, + # which would make it recurse too deep. 
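+    # Exceeding 50 levels of nesting can then be reported with a
+    # normal error message instead of a crash.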
+ parens: Token[50] + parens_len: int + +def read_byte(self: Tokenizer*) -> byte: + EOF = -1 # FIXME + + c: byte + if self->pushback_len > 0: + c = self->pushback[--self->pushback_len] + else: + temp = fgetc(self->f) + if temp == '\r': + # On Windows, \r just before \n is ignored. + temp = fgetc(self->f) + if temp != EOF and temp != '\n': + # TODO: test this, if possible? + fail(self->location, "source file contains a CR byte ('\\r') that isn't a part of a CRLF line ending") + + if temp == EOF: + if ferror(self->f) != 0: + # TODO: include errno in the error message + fail(self->location, "cannot read file") + # Use the zero byte to denote end of file. + c = '\0' + elif temp == '\0': + # TODO: test this + fail(self->location, "source file contains a zero byte") + c = 'x' # TODO: silences compiler warning, but never runs + else: + c = temp as byte + + if c == '\n': + self->location.lineno++ + return c + + +def unread_byte(self: Tokenizer*, b: byte) -> void: + if b == '\0': + return + + assert(b != '\r') + self->pushback = realloc(self->pushback, self->pushback_len + 1) + self->pushback[self->pushback_len++] = b + if b == '\n': + self->location.lineno-- + +def is_identifier_or_number_byte(b: byte) -> bool: + return ( + ('A' <= b and b <= 'Z') + or ('a' <= b and b <= 'z') + or ('0' <= b and b <= '9') + or b == '_' + ) + +def read_identifier_or_number(self: Tokenizer*, first_byte: byte) -> byte[100]: + dest: byte[100] + for i = 0; i < 100; i++: # TODO: memset + dest[i] = '\0' + destlen = 0 + + assert(is_identifier_or_number_byte(first_byte)) + dest[destlen++] = first_byte + + while True: + b = read_byte(self) + if is_identifier_or_number_byte(b): + if destlen == sizeof dest - 1: + fail(self->location, "name or number is too long") + dest[destlen++] = b + else: + unread_byte(self, b) + return dest + +def read_token(self: Tokenizer*) -> Token: + while True: + token = Token{location = self->location} + b = read_byte(self) + + if is_identifier_or_number_byte(b): + token.short_string = read_identifier_or_number(self, b) + token.kind = TokenKind::Name + else: + token.kind = TokenKind::EndOfFile + return token + +def tokenize_without_indent_dedent_tokens(file: FILE*, path: byte*) -> Token*: + tokenizer = Tokenizer{ + location = Location{path = path}, + f = file, + } + + # Add a fake newline to the beginning. It does a few things: + # * Less special-casing: blank lines in the beginning of the file can + # cause there to be a newline token anyway. + # * It is easier to detect an unexpected indentation in the beginning + # of the file, as it becomes just like any other indentation. + # * Line numbers start at 1. 
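+    #
+    # For example, if the file starts with 4 spaces of indentation, the
+    # tokenizer sees a newline token whose indentation level is 4, so
+    # unexpected indentation at the start of the file can be handled
+    # like on any other line.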
+ tokenizer.pushback = malloc(1) + tokenizer.pushback[0] = '\n' + tokenizer.pushback_len = 1 + + tokens: Token* = NULL + len = 0 + while len == 0 or tokens[len-1].kind != TokenKind::EndOfFile: + tokens = realloc(tokens, sizeof(tokens[0]) * (len+1)) + printf("Size of token = %lld\n", sizeof(tokens[0])) + printf("Just realloced to %lld\n", sizeof(tokens[0])* (len+1)) + printf("base=%p dest=%p\n", tokens, &tokens[len]) + # FIXME: can't do tokens[len++] + t = read_token(&tokenizer) + printf("Asd\n") + printf("base=%p dest=%p\n", tokens, &tokens[len]) + tokens[len] = t + printf("base=%p dest=%p\n", tokens, &tokens[len]) + len++ + + free(tokenizer.pushback) + return tokens + +def tokenize(path: byte*) -> Token*: + file = fopen(path, "rb") + if file == NULL: + # TODO: test this + # TODO: include errno in the message + fail(Location{path=path}, "cannot open file") + raw_tokens = tokenize_without_indent_dedent_tokens(file, path) + # TODO: handle indentations and such + return raw_tokens + + +def main() -> int: + tokens = tokenize("examples/hello.jou") + + t = tokens + while True: + print_token(t) + # TODO: Shouldn't need parentheses. + if (t++)->kind == TokenKind::EndOfFile: + break + + free(tokens) + return 0 diff --git a/stdlib/io.jou b/stdlib/io.jou index 5a13461f..e0bfeb96 100644 --- a/stdlib/io.jou +++ b/stdlib/io.jou @@ -60,6 +60,10 @@ declare fprintf(file: FILE *, pattern: byte*, ...) -> int declare fgetc(file: FILE*) -> int # see getchar() declare fscanf(file: FILE*, pattern: byte*, ...) -> int +# Ensure that output is actually written. It may remain buffered +# without calling this function. +declare fflush(file: FILE*) -> int + # Read a line of text from file into a string starting at the given # pointer. Reading stops at newline character, end of file, on error, # or when the resulting string (including the '\0') wouldn't fit @@ -68,5 +72,9 @@ declare fscanf(file: FILE*, pattern: byte*, ...) -> int # Return value: NULL on error, same as destination on success. declare fgets(destination: byte*, n: int, file: FILE*) -> byte* +# TODO: document +declare feof(file: FILE*) -> int +declare ferror(file: FILE*) -> int + # Move back to beginning of file. 
declare rewind(file: FILE*) -> void diff --git a/stdlib/mem.jou b/stdlib/mem.jou index 94250f58..e3095506 100644 --- a/stdlib/mem.jou +++ b/stdlib/mem.jou @@ -3,6 +3,7 @@ # Heap allocations # TODO: write a tutorial about using these and add a link declare malloc(size: long) -> void* +declare realloc(ptr: void*, size: long) -> void* declare free(ptr: void*) -> void declare memcpy(dest: void*, source: void*, count: long) -> void* From fcf5f72283df4acd5861aa6ae12020b88880cc7b Mon Sep 17 00:00:00 2001 From: Akuli Date: Fri, 24 Feb 2023 19:02:11 +0200 Subject: [PATCH 02/15] More working on the self-hosted tokenizer --- self_hosted/tokenizer.jou | 59 ++++++++++++++++++++++++++++++--------- 1 file changed, 46 insertions(+), 13 deletions(-) diff --git a/self_hosted/tokenizer.jou b/self_hosted/tokenizer.jou index 548cc697..7a6dff38 100644 --- a/self_hosted/tokenizer.jou +++ b/self_hosted/tokenizer.jou @@ -48,6 +48,10 @@ def print_token(token: Token*) -> void: printf("\n") elif token->kind == TokenKind::EndOfFile: printf("end of file\n") + elif token->kind == TokenKind::Name: + printf("name \"%s\"\n", &token->short_string[0]) + elif token->kind == TokenKind::Newline: + printf("newline, next indent %d\n", token->indentation_level) else: printf("????\n") @@ -132,15 +136,53 @@ def read_identifier_or_number(self: Tokenizer*, first_byte: byte) -> byte[100]: unread_byte(self, b) return dest +def consume_rest_of_line(self: Tokenizer*) -> void: + while True: + c = read_byte(self) + if c == '\0' or c == '\n': + break + +# Returns the indentation level for the next line +def read_newline_token(self: Tokenizer*) -> int: + level = 0 + while True: + c = read_byte(self) + if c == '\0': + # End of file. Do not validate that indentation is a + # multiple of 4 spaces. Add a trailing newline implicitly + # if needed. 
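+            # For example, if the last line of the file contains only
+            # a few spaces, they do not need to be a multiple of 4:
+            # returning 0 treats the file as if it ended with a newline
+            # right away.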
+            #
+            # TODO: test this
+            return 0
+        elif c == '\n':
+            level = 0
+        elif c == '#':
+            consume_rest_of_line(self)
+            level = 0
+        elif c == ' ':
+            level++
+        else:
+            unread_byte(self, c)
+            return level
+
 def read_token(self: Tokenizer*) -> Token:
     while True:
         token = Token{location = self->location}
         b = read_byte(self)
 
-        if is_identifier_or_number_byte(b):
-            token.short_string = read_identifier_or_number(self, b)
+        if b == ' ':
+            continue
+        if b == '\n':
+            if self->parens_len > 0:
+                continue
+            token.kind = TokenKind::Newline
+            token.indentation_level = read_newline_token(self)
+        elif is_identifier_or_number_byte(b):
             token.kind = TokenKind::Name
+            token.short_string = read_identifier_or_number(self, b)
         else:
+            printf("TODO '%c'\n", b)
+            # TODO
             token.kind = TokenKind::EndOfFile
         return token
 
@@ -164,16 +206,7 @@ def tokenize_without_indent_dedent_tokens(file: FILE*, path: byte*) -> Token*:
     len = 0
     while len == 0 or tokens[len-1].kind != TokenKind::EndOfFile:
         tokens = realloc(tokens, sizeof(tokens[0]) * (len+1))
-        printf("Size of token = %lld\n", sizeof(tokens[0]))
-        printf("Just realloced to %lld\n", sizeof(tokens[0])* (len+1))
-        printf("base=%p dest=%p\n", tokens, &tokens[len])
-        # FIXME: can't do tokens[len++]
-        t = read_token(&tokenizer)
-        printf("Asd\n")
-        printf("base=%p dest=%p\n", tokens, &tokens[len])
-        tokens[len] = t
-        printf("base=%p dest=%p\n", tokens, &tokens[len])
-        len++
+        tokens[len++] = read_token(&tokenizer)
 
     free(tokenizer.pushback)
     return tokens
@@ -190,7 +223,7 @@ def tokenize(path: byte*) -> Token*:
 
 
 def main() -> int:
-    tokens = tokenize("examples/hello.jou")
+    tokens = tokenize("../examples/hello.jou")
 
     t = tokens
     while True:

From 5d2e515db5a045fc9d4caafe2f9e881f56074a14 Mon Sep 17 00:00:00 2001
From: Akuli
Date: Fri, 24 Feb 2023 19:17:50 +0200
Subject: [PATCH 03/15] self-hosted tokenizer: strings

---
 self_hosted/tokenizer.jou | 57 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 56 insertions(+), 1 deletion(-)

diff --git a/self_hosted/tokenizer.jou b/self_hosted/tokenizer.jou
index 7a6dff38..407ea95e 100644
--- a/self_hosted/tokenizer.jou
+++ b/self_hosted/tokenizer.jou
@@ -1,4 +1,5 @@
 from "stdlib/io.jou" import printf, FILE, fgetc, ferror, fopen
+from "stdlib/str.jou" import sprintf
 from "stdlib/mem.jou" import malloc, realloc, free
 from "./errors_and_warnings.jou" import Location, fail, assert
@@ -52,6 +53,8 @@ def print_token(token: Token*) -> void:
         printf("name \"%s\"\n", &token->short_string[0])
     elif token->kind == TokenKind::Newline:
         printf("newline, next indent %d\n", token->indentation_level)
+    elif token->kind == TokenKind::String:
+        printf("string \"%s\"\n", token->long_string)
     else:
         printf("????\n")
 
@@ -165,18 +168,70 @@ def read_newline_token(self: Tokenizer*) -> int:
         unread_byte(self, c)
         return level
 
+def read_string(self: Tokenizer*) -> byte*:
+    result: byte* = NULL
+    len = 0
+
+    while True:
+        c = read_byte(self)
+        if c == '"':
+            break
+        elif c == '\n' or c == '\0':
+            if c == '\n':
+                self->location.lineno--
+            fail(self->location, "missing \" to end the string")
+        elif c == '\\':
+            # \n means newline, for example
+            after_backslash = read_byte(self)
+            if after_backslash == '\0':
+                fail(self->location, "missing \" to end the string")
+            elif after_backslash == 'n':
+                result = realloc(result, len+1)
+                result[len++] = '\n'
+            elif after_backslash == 'r':
+                result = realloc(result, len+1)
+                result[len++] = '\r'
+            elif after_backslash == '\\' or after_backslash == '"':
+                result = realloc(result, len+1)
+                result[len++] = after_backslash
+            elif after_backslash == 
'0': + fail(self->location, "strings cannot contain zero bytes (\\0), because that is the special end marker byte") + elif '0' <= after_backslash and after_backslash <= '9': + result = realloc(result, len+1) + result[len++] = after_backslash - '0' + elif after_backslash == '\n': + # \ at end of line, string continues on next line + len = len # TODO: pass statement + else: + if after_backslash < 0x80 and isprint(after_backslash) != 0: + message: byte* = malloc(100) + sprintf(message, "unknown escape: '\\%c'", after_backslash) + fail(self->location, message) + else: + fail(self->location, "unknown '\\' escape") + else: + result = realloc(result, len+1) + result[len++] = c + + result = realloc(result, len+1) + result[len] = '\0' + return result + def read_token(self: Tokenizer*) -> Token: while True: token = Token{location = self->location} b = read_byte(self) - if b == ' ': continue + if b == '\n': if self->parens_len > 0: continue token.kind = TokenKind::Newline token.indentation_level = read_newline_token(self) + elif b == '"': + token.kind = TokenKind::String + token.long_string = read_string(self) elif is_identifier_or_number_byte(b): token.kind = TokenKind::Name token.short_string = read_identifier_or_number(self, b) From ebe2a3f8eb99a4a7af6b80a49f738c88fec679c3 Mon Sep 17 00:00:00 2001 From: Akuli Date: Fri, 24 Feb 2023 19:46:48 +0200 Subject: [PATCH 04/15] hello world tokenizes :) --- self_hosted/tokenizer.jou | 96 +++++++++++++++++++++++++++++++++++++-- stdlib/mem.jou | 1 + stdlib/str.jou | 9 ++++ 3 files changed, 101 insertions(+), 5 deletions(-) diff --git a/self_hosted/tokenizer.jou b/self_hosted/tokenizer.jou index 407ea95e..66ceb6f3 100644 --- a/self_hosted/tokenizer.jou +++ b/self_hosted/tokenizer.jou @@ -1,6 +1,6 @@ from "stdlib/io.jou" import printf, FILE, fgetc, ferror, fopen -from "stdlib/str.jou" import sprintf -from "stdlib/mem.jou" import malloc, realloc, free +from "stdlib/str.jou" import sprintf, strlen, strchr, strcmp +from "stdlib/mem.jou" import malloc, realloc, free, memset from "./errors_and_warnings.jou" import Location, fail, assert enum TokenKind: @@ -49,6 +49,8 @@ def print_token(token: Token*) -> void: printf("\n") elif token->kind == TokenKind::EndOfFile: printf("end of file\n") + elif token->kind == TokenKind::Operator: + printf("operator '%s'\n", &token->short_string[0]) elif token->kind == TokenKind::Name: printf("name \"%s\"\n", &token->short_string[0]) elif token->kind == TokenKind::Newline: @@ -217,6 +219,84 @@ def read_string(self: Tokenizer*) -> byte*: result[len] = '\0' return result +def is_operator_byte(c: byte) -> bool: + return c != '\0' and strchr("=<>!.,()[]{};:+-*/&%", c) != NULL + +declare strncmp(s1: byte*, s2: byte*, n: long) -> int + +def starts_with(s: byte*, prefix: byte*) -> bool: + return strncmp(s, prefix, strlen(prefix)) == 0 + +def read_operator(self: Tokenizer*) -> byte[100]: + # TODO: nicer array syntax + operators: byte*[100] + i = 0 + # Longer operators first, so that '==' does not parse as '=' '=' + operators[i++] = "..." + operators[i++] = "===" + operators[i++] = "!==" + operators[i++] = "==" + operators[i++] = "!=" + operators[i++] = "->" + operators[i++] = "<=" + operators[i++] = ">=" + operators[i++] = "++" + operators[i++] = "--" + operators[i++] = "+=" + operators[i++] = "-=" + operators[i++] = "*=" + operators[i++] = "/=" + operators[i++] = "%=" + operators[i++] = "::" + operators[i++] = "." 
+    operators[i++] = ","
+    operators[i++] = ":"
+    operators[i++] = ";"
+    operators[i++] = "="
+    operators[i++] = "("
+    operators[i++] = ")"
+    operators[i++] = "{"
+    operators[i++] = "}"
+    operators[i++] = "["
+    operators[i++] = "]"
+    operators[i++] = "&"
+    operators[i++] = "%"
+    operators[i++] = "*"
+    operators[i++] = "/"
+    operators[i++] = "+"
+    operators[i++] = "-"
+    operators[i++] = "<"
+    operators[i++] = ">"
+    operators[i] = NULL
+
+    operator: byte[100]
+    memset(&operator, 0, sizeof operator)
+
+    # Read as many operator characters as we may need.
+    while strlen(&operator[0]) < 3:
+        c = read_byte(self)
+        if not is_operator_byte(c):
+            unread_byte(self, c)
+            break
+        operator[strlen(&operator[0])] = c
+
+    for op = &operators[0]; *op != NULL; op++:
+        if starts_with(&operator[0], *op):
+            # Unread the bytes we didn't use.
+            while strlen(&operator[0]) > strlen(*op):
+                last = &operator[strlen(&operator[0]) - 1]
+                unread_byte(self, *last)
+                *last = '\0'
+
+            # "===" and "!==" are here only to give a better error message to javascript people.
+            if strcmp(&operator[0], "===") != 0 and strcmp(&operator[0], "!==") != 0:
+                return operator
+
+    message: byte[100]
+    sprintf(&message[0], "there is no '%s' operator", &operator[0])
+    fail(self->location, &message[0])
+    return operator # TODO: never actually runs, but causes a compiler warning
+
 def read_token(self: Tokenizer*) -> Token:
     while True:
         token = Token{location = self->location}
         b = read_byte(self)
@@ -235,10 +315,16 @@ def read_token(self: Tokenizer*) -> Token:
         elif is_identifier_or_number_byte(b):
             token.kind = TokenKind::Name
             token.short_string = read_identifier_or_number(self, b)
-        else:
-            printf("TODO '%c'\n", b)
-            # TODO
+        elif is_operator_byte(b):
+            unread_byte(self, b)
+            token.kind = TokenKind::Operator
+            token.short_string = read_operator(self)
+        elif b == '\0':
             token.kind = TokenKind::EndOfFile
+        else:
+            message: byte[100]
+            sprintf(&message[0], "unexpected byte %#02x", b)
+            fail(self->location, &message[0])
         return token
 
 def tokenize_without_indent_dedent_tokens(file: FILE*, path: byte*) -> Token*:
diff --git a/stdlib/mem.jou b/stdlib/mem.jou
index e3095506..bd905113 100644
--- a/stdlib/mem.jou
+++ b/stdlib/mem.jou
@@ -6,4 +6,5 @@ declare malloc(size: long) -> void*
 declare realloc(ptr: void*, size: long) -> void*
 declare free(ptr: void*) -> void
 
+declare memset(dest: void*, fill_byte: int, count: long) -> void*
 declare memcpy(dest: void*, source: void*, count: long) -> void*
diff --git a/stdlib/str.jou b/stdlib/str.jou
index bdbe565d..0fb56698 100644
--- a/stdlib/str.jou
+++ b/stdlib/str.jou
@@ -11,3 +11,12 @@ declare snprintf(dest: byte*, n: long, pattern: byte*, ...) -> int
 
 # Find a substring. Return a pointer to the occurrence in haystack, or NULL if not found.
 declare strstr(haystack: byte*, needle: byte*) -> byte*
+
+# Similar to strstr(), but searches for a single byte rather than a substring.
+declare strchr(haystack: byte*, needle: byte) -> byte*
+
+# Calculate the length of a string in bytes. Note that strlen("ö") == 2, for example.
+declare strlen(s: byte*) -> long
+
+# Compare the strings. Return 0 for equal, or nonzero for not equal. 
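+# For example, strcmp("abc", "abc") == 0 and strcmp("abc", "abd") != 0.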
+declare strcmp(s1: byte*, s2: byte*) -> int From 0195a9bc6e1777fdeea85bffc89b75574671b390 Mon Sep 17 00:00:00 2001 From: Akuli Date: Fri, 24 Feb 2023 21:08:45 +0200 Subject: [PATCH 05/15] Simplify how tokenizing works at end of file --- src/tokenize.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/tokenize.c b/src/tokenize.c index 506aabb2..67cb2f84 100644 --- a/src/tokenize.c +++ b/src/tokenize.c @@ -126,7 +126,7 @@ static void read_indentation_as_newline_token(struct State *st, Token *t) else if (c == '\0') { // Ignore newline+spaces at end of file. Do not validate 4 spaces. // TODO: test case - t->type = TOKEN_END_OF_FILE; + t->data.indentation_level = 0; return; } else { unread_byte(st, c); @@ -480,10 +480,6 @@ static Token *handle_indentations(const Token *temp_tokens) do{ if (t->type == TOKEN_END_OF_FILE) { - // Add an extra newline token at end of file and the dedents after it. - // This makes it similar to how other newline and dedent tokens work: - // the dedents always come after a newline token. - Append(&tokens, (Token){ .location=t->location, .type=TOKEN_NEWLINE }); while(level) { Append(&tokens, (Token){ .location=t->location, .type=TOKEN_DEDENT }); level -= 4; From 06fa4099f593b03413418fd88d195af1a9026c37 Mon Sep 17 00:00:00 2001 From: Akuli Date: Fri, 24 Feb 2023 21:46:01 +0200 Subject: [PATCH 06/15] Make sure hello world tokenizes in exactly same way with both tokenizers. --- self_hosted/tokenizer.jou | 138 +++++++++++++++++++++++++++++++++++--- src/print.c | 4 +- stdlib/mem.jou | 1 + 3 files changed, 130 insertions(+), 13 deletions(-) diff --git a/self_hosted/tokenizer.jou b/self_hosted/tokenizer.jou index 66ceb6f3..72523298 100644 --- a/self_hosted/tokenizer.jou +++ b/self_hosted/tokenizer.jou @@ -1,6 +1,6 @@ from "stdlib/io.jou" import printf, FILE, fgetc, ferror, fopen from "stdlib/str.jou" import sprintf, strlen, strchr, strcmp -from "stdlib/mem.jou" import malloc, realloc, free, memset +from "stdlib/mem.jou" import malloc, realloc, free, memset, memmove from "./errors_and_warnings.jou" import Location, fail, assert enum TokenKind: @@ -53,10 +53,16 @@ def print_token(token: Token*) -> void: printf("operator '%s'\n", &token->short_string[0]) elif token->kind == TokenKind::Name: printf("name \"%s\"\n", &token->short_string[0]) + elif token->kind == TokenKind::Keyword: + printf("keyword \"%s\"\n", &token->short_string[0]) elif token->kind == TokenKind::Newline: - printf("newline, next indent %d\n", token->indentation_level) + printf("newline token (next line has %d spaces of indentation)\n", token->indentation_level) elif token->kind == TokenKind::String: printf("string \"%s\"\n", token->long_string) + elif token->kind == TokenKind::Indent: + printf("indent (+4 spaces)\n") + elif token->kind == TokenKind::Dedent: + printf("dedent (-4 spaces)\n") else: printf("????\n") @@ -297,6 +303,49 @@ def read_operator(self: Tokenizer*) -> byte[100]: fail(self->location, &message[0]) return operator # TODO: never actually runs, but causes a compiler warning +def is_keyword(word: byte*) -> bool: + # TODO: better array syntax + keywords: byte*[100] + i = 0 + keywords[i++] = "from" + keywords[i++] = "import" + keywords[i++] = "def" + keywords[i++] = "declare" + keywords[i++] = "struct" + keywords[i++] = "enum" + keywords[i++] = "global" + keywords[i++] = "return" + keywords[i++] = "if" + keywords[i++] = "elif" + keywords[i++] = "else" + keywords[i++] = "while" + keywords[i++] = "for" + keywords[i++] = "break" + keywords[i++] = "continue" + keywords[i++] = 
"True" + keywords[i++] = "False" + keywords[i++] = "NULL" + keywords[i++] = "and" + keywords[i++] = "or" + keywords[i++] = "not" + keywords[i++] = "as" + keywords[i++] = "sizeof" + keywords[i++] = "void" + keywords[i++] = "bool" + keywords[i++] = "byte" + keywords[i++] = "int" + keywords[i++] = "long" + keywords[i++] = "float" + keywords[i++] = "double" + keywords[i++] = NULL + + for kw = &keywords[0]; *kw != NULL; kw++: + if strcmp(*kw, word) == 0: + return True + return False + +declare atoi(s: byte*) -> int + def read_token(self: Tokenizer*) -> Token: while True: token = Token{location = self->location} @@ -313,8 +362,15 @@ def read_token(self: Tokenizer*) -> Token: token.kind = TokenKind::String token.long_string = read_string(self) elif is_identifier_or_number_byte(b): - token.kind = TokenKind::Name token.short_string = read_identifier_or_number(self, b) + if is_keyword(&token.short_string[0]): + token.kind = TokenKind::Keyword + elif '0' <= token.short_string[0] and token.short_string[0] <= '9': + # TODO: support various other things + token.kind = TokenKind::Int + token.int_value = atoi(&token.short_string[0]) + else: + token.kind = TokenKind::Name elif is_operator_byte(b): unread_byte(self, b) token.kind = TokenKind::Operator @@ -352,26 +408,86 @@ def tokenize_without_indent_dedent_tokens(file: FILE*, path: byte*) -> Token*: free(tokenizer.pushback) return tokens +# Creates a new array of tokens with indent/dedent tokens added after +# newline tokens that change the indentation level. +def handle_indentations(raw_tokens: Token*) -> Token*: + tokens: Token* = NULL + ntokens = 0 + level = 0 + + for t = raw_tokens; True; t++: + if t->kind == TokenKind::EndOfFile: + # Add an extra newline token at end of file and the dedents after it. + # This makes it similar to how other newline and dedent tokens work: + # the dedents always come after a newline token. + tokens = realloc(tokens, sizeof tokens[0] * (ntokens + level/4 + 1)) + while level != 0: + tokens[ntokens++] = Token{location = t->location, kind = TokenKind::Dedent} + level -= 4 + tokens[ntokens++] = *t + break + + tokens = realloc(tokens, sizeof tokens[0] * (ntokens+1)) + tokens[ntokens++] = *t + + if t->kind == TokenKind::Newline: + after_newline = t->location + after_newline.lineno++ + + if t->indentation_level % 4 != 0: + fail(after_newline, "indentation must be a multiple of 4 spaces") + + while level < t->indentation_level: + tokens = realloc(tokens, sizeof tokens[0] * (ntokens+1)) + tokens[ntokens++] = Token{location = after_newline, kind = TokenKind::Indent} + level += 4 + + while level > t->indentation_level: + tokens = realloc(tokens, sizeof tokens[0] * (ntokens+1)) + tokens[ntokens++] = Token{location = after_newline, kind = TokenKind::Dedent} + level -= 4 + + # Delete the newline token in the beginning. + # + # If the file has indentations after it, they are now represented by separate + # indent tokens and parsing will fail. If the file doesn't have any blank/comment + # lines in the beginning, it has a newline token anyway to avoid special casing. 
+ assert(tokens[0].kind == TokenKind::Newline) + memmove(&tokens[0], &tokens[1], sizeof tokens[0] * (ntokens - 1)) + + return tokens + def tokenize(path: byte*) -> Token*: file = fopen(path, "rb") if file == NULL: # TODO: test this # TODO: include errno in the message fail(Location{path=path}, "cannot open file") - raw_tokens = tokenize_without_indent_dedent_tokens(file, path) - # TODO: handle indentations and such - return raw_tokens + raw_tokens = tokenize_without_indent_dedent_tokens(file, path) + better_tokens = handle_indentations(raw_tokens) + free(raw_tokens) + return better_tokens -def main() -> int: - tokens = tokenize("../examples/hello.jou") - +def print_tokens(tokens: Token*) -> void: + printf("===== Tokens for file \"%s\" =====\n", tokens->location.path) t = tokens + current_lineno = -1 + while True: + if t->location.lineno != current_lineno: + current_lineno = t->location.lineno + printf("\nLine %d:\n", current_lineno) + + printf(" ") print_token(t) - # TODO: Shouldn't need parentheses. - if (t++)->kind == TokenKind::EndOfFile: + + if t->kind == TokenKind::EndOfFile: break + t++ +def main() -> int: + tokens = tokenize("../examples/hello.jou") + print_tokens(tokens) free(tokens) return 0 diff --git a/src/print.c b/src/print.c index 215fca53..4588c080 100644 --- a/src/print.c +++ b/src/print.c @@ -93,10 +93,10 @@ void print_token(const Token *token) printf("end of file\n"); break; case TOKEN_INDENT: - printf("more indentation (+4 spaces)\n"); + printf("indent (+4 spaces)\n"); break; case TOKEN_DEDENT: - printf("less indentation (-4 spaces)\n"); + printf("dedent (-4 spaces)\n"); break; case TOKEN_OPERATOR: printf("operator '%s'\n", token->data.operator); diff --git a/stdlib/mem.jou b/stdlib/mem.jou index bd905113..b71d01e8 100644 --- a/stdlib/mem.jou +++ b/stdlib/mem.jou @@ -8,3 +8,4 @@ declare free(ptr: void*) -> void declare memset(dest: void*, fill_byte: int, count: long) -> void* declare memcpy(dest: void*, source: void*, count: long) -> void* +declare memmove(dest: void*, source: void*, count: long) -> void* From 598cde2bf7474494da97c6f5e6f97a0d526cfe53 Mon Sep 17 00:00:00 2001 From: Akuli Date: Fri, 24 Feb 2023 22:52:56 +0200 Subject: [PATCH 07/15] Add tests for the self-hosted tokenizer --- .github/workflows/linux.yml | 36 +++-- .github/workflows/windows.yml | 14 ++ self_hosted/tokenizer.jou | 7 +- self_hosted/tokenizes_wrong.txt | 186 ++++++++++++++++++++++++++ src/jou_compiler.h | 1 + src/main.c | 13 +- tests/should_succeed/compiler_cli.jou | 1 + tokenizers.sh | 49 +++++++ 8 files changed, 290 insertions(+), 17 deletions(-) create mode 100644 self_hosted/tokenizes_wrong.txt create mode 100755 tokenizers.sh diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 9486d398..73d4b72f 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -13,17 +13,25 @@ jobs: # Testing all levels because there was a bug that only happened with -O1. 
(#224) opt-level: ['-O0', '-O1', '-O2', '-O3'] steps: - - uses: actions/checkout@v3 - - run: sudo apt install -y llvm-${{ matrix.llvm-version }}-dev clang-${{ matrix.llvm-version }} make valgrind - - run: LLVM_CONFIG=llvm-config-${{ matrix.llvm-version }} make - - run: ./runtests.sh --verbose './jou ${{ matrix.opt-level }} %s' - - run: ./runtests.sh --verbose './jou ${{ matrix.opt-level }} --verbose %s' - - run: ./runtests.sh --verbose --valgrind './jou ${{ matrix.opt-level }} %s' - # valgrind+verbose isn't meaningful: test script would ignore valgrind output - - run: make clean - - name: Check that "make clean" deleted all files not committed to Git - run: | - if [ "$(git status --porcelain --ignored)" != "" ]; then - git status --ignored - exit 1 - fi + - uses: actions/checkout@v3 + - run: sudo apt install -y llvm-${{ matrix.llvm-version }}-dev clang-${{ matrix.llvm-version }} make valgrind + - run: LLVM_CONFIG=llvm-config-${{ matrix.llvm-version }} make + - run: ./runtests.sh --verbose './jou ${{ matrix.opt-level }} %s' + - run: ./runtests.sh --verbose './jou ${{ matrix.opt-level }} --verbose %s' + - run: ./runtests.sh --verbose --valgrind './jou ${{ matrix.opt-level }} %s' + # valgrind+verbose isn't meaningful: test script would ignore valgrind output + - run: make clean + - name: Check that "make clean" deleted all files not committed to Git + run: | + if [ "$(git status --porcelain --ignored)" != "" ]; then + git status --ignored + exit 1 + fi + + tokenizers: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - run: sudo apt install -y llvm-13-dev clang-13 make valgrind + - run: LLVM_CONFIG=llvm-config-13 make + - run: ./tokenizers.sh diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index 6fcad65a..ffd9e3ce 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -131,3 +131,17 @@ jobs: shell: bash - run: cd "test dir" && ./runtests.sh --verbose shell: bash + + tokenizers: + needs: build + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: actions/download-artifact@v3 + with: + name: windows-zip + - run: unzip jou.zip + - run: mv jou/* . + shell: bash + - run: ./tokenizers.sh + shell: bash diff --git a/self_hosted/tokenizer.jou b/self_hosted/tokenizer.jou index 72523298..94cfc3a0 100644 --- a/self_hosted/tokenizer.jou +++ b/self_hosted/tokenizer.jou @@ -486,8 +486,11 @@ def print_tokens(tokens: Token*) -> void: break t++ -def main() -> int: - tokens = tokenize("../examples/hello.jou") + printf("\n") + +def main(argc: int, argv: byte**) -> int: + assert(argc == 2) + tokens = tokenize(argv[1]) print_tokens(tokens) free(tokens) return 0 diff --git a/self_hosted/tokenizes_wrong.txt b/self_hosted/tokenizes_wrong.txt new file mode 100644 index 00000000..19b9e63b --- /dev/null +++ b/self_hosted/tokenizes_wrong.txt @@ -0,0 +1,186 @@ +# This is a list of files that are not yet supported by the self-hosted compiler. 
+examples/fib.jou +examples/x11_window.jou +tests/syntax_error/import_after_def.jou +tests/syntax_error/missing_return_type.jou +tests/syntax_error/missing_arg_type.jou +tests/syntax_error/declare_global_with_value.jou +tests/syntax_error/arg_after_dotdotdot.jou +tests/syntax_error/chained_eq.jou +tests/syntax_error/bad_type.jou +tests/syntax_error/missing_import_keyword.jou +tests/syntax_error/missing_second_equal_sign.jou +tests/syntax_error/hex.jou +tests/syntax_error/double_assignment.jou +tests/syntax_error/double_with_letters_after.jou +tests/syntax_error/dot_after_e.jou +tests/syntax_error/unnecessary_zero.jou +tests/syntax_error/bad_addressof.jou +tests/syntax_error/bin.jou +tests/syntax_error/arg_default.jou +tests/syntax_error/def_missing_args.jou +tests/syntax_error/bad_expression.jou +tests/syntax_error/missing_field_names.jou +tests/syntax_error/bad_field.jou +tests/syntax_error/string_zero_byte.jou +tests/syntax_error/triple_equals.jou +tests/syntax_error/import_missing_quotes.jou +tests/syntax_error/unknown_escape_ascii.jou +tests/syntax_error/bad_struct_field_name.jou +tests/syntax_error/python_style_for.jou +tests/syntax_error/import1.jou +tests/syntax_error/and_or_chaining.jou +tests/syntax_error/bad_function_name_after_def.jou +tests/syntax_error/empty_char.jou +tests/syntax_error/import_missing_comma_with_parens.jou +tests/syntax_error/2bad.jou +tests/syntax_error/array_size.jou +tests/syntax_error/multidot_float.jou +tests/syntax_error/0b2.jou +tests/syntax_error/ee.jou +tests/syntax_error/overlong_char.jou +tests/syntax_error/dotdotdot_dotdotdot.jou +tests/syntax_error/bad_byte.jou +tests/syntax_error/first_line_indent.jou +tests/syntax_error/too_many_closing_parens.jou +tests/syntax_error/indentation_not4.jou +tests/syntax_error/import_missing_dot.jou +tests/syntax_error/unknown_escape_multibyte.jou +tests/syntax_error/infinite_c_style_for.jou +tests/syntax_error/struct_missing_type.jou +tests/syntax_error/bad_toplevel_declaration.jou +tests/syntax_error/missing_indentation.jou +tests/syntax_error/import_missing_comma.jou +tests/syntax_error/bad_argument_name.jou +tests/syntax_error/mismatched_close_brace.jou +tests/syntax_error/bad_statement.jou +tests/syntax_error/indexing.jou +tests/syntax_error/struct_init_js_syntax.jou +tests/syntax_error/missing_colon.jou +tests/syntax_error/missing_number_after_eminus.jou +tests/syntax_error/bad_struct_name.jou +tests/syntax_error/missing_number_after_e.jou +tests/syntax_error/chained_le.jou +tests/syntax_error/float.jou +tests/syntax_error/double_not.jou +tests/syntax_error/too_many_opening_parens.jou +tests/syntax_error/struct_default.jou +tests/crash/null_deref.jou +tests/wrong_type/assign_void.jou +tests/wrong_type/arg.jou +tests/wrong_type/assign_with_type.jou +tests/wrong_type/while.jou +tests/wrong_type/var_assignment.jou +tests/wrong_type/cannot_be_indexed.jou +tests/wrong_type/float_and_double.jou +tests/wrong_type/struct_member_assign.jou +tests/wrong_type/deref_non_pointer.jou +tests/wrong_type/elif.jou +tests/wrong_type/int_to_enum.jou +tests/wrong_type/arrow_operator_not_struct.jou +tests/wrong_type/enum_member_from_struct.jou +tests/wrong_type/enum_to_int.jou +tests/wrong_type/assign_to_deref_non_pointer.jou +tests/wrong_type/index.jou +tests/wrong_type/dot_operator.jou +tests/wrong_type/neg.jou +tests/wrong_type/array_to_ptr.jou +tests/wrong_type/arg_with_varargs.jou +tests/wrong_type/inplace_add_doesnt_go_back.jou +tests/wrong_type/mod.jou +tests/wrong_type/pointer_assignment.jou +tests/wrong_type/for.jou 
+tests/wrong_type/bool_main.jou +tests/wrong_type/plusplus.jou +tests/wrong_type/not.jou +tests/wrong_type/brace_init_arg.jou +tests/wrong_type/return_value.jou +tests/wrong_type/if.jou +tests/wrong_type/struct_member_init.jou +tests/wrong_type/void_main.jou +tests/wrong_type/array_vararg.jou +tests/wrong_type/pointer_eq.jou +tests/wrong_type/or.jou +tests/wrong_type/arrow_operator_not_pointer.jou +tests/should_succeed/assign.jou +tests/should_succeed/enum.jou +tests/should_succeed/string_syntax.jou +tests/should_succeed/octalnuber.jou +tests/should_succeed/sizeof.jou +tests/should_succeed/global_bug.jou +tests/should_succeed/crlf.jou +tests/should_succeed/loops.jou +tests/should_succeed/add_sub_mul_div_mod.jou +tests/should_succeed/printf.jou +tests/should_succeed/undefined_value_warning.jou +tests/should_succeed/global.jou +tests/should_succeed/pointer.jou +tests/should_succeed/mathlibtest.jou +tests/should_succeed/sscanf.jou +tests/should_succeed/plusplus_minusminus.jou +tests/should_succeed/stderr.jou +tests/should_succeed/return_void.jou +tests/should_succeed/unreachable_warning.jou +tests/should_succeed/local_import.jou +tests/should_succeed/array.jou +tests/should_succeed/compare.jou +tests/should_succeed/and_or_not.jou +tests/should_succeed/struct.jou +tests/should_succeed/argument.jou +tests/should_succeed/compiler_cli.jou +tests/should_succeed/file.jou +tests/should_succeed/expfloat.jou +tests/should_succeed/implicit_conversions.jou +tests/should_succeed/return_string.jou +tests/should_succeed/as.jou +tests/should_succeed/if_elif_else.jou +tests/should_succeed/unused_import.jou +tests/other_errors/missing_return.jou +tests/other_errors/brace_init_dupe.jou +tests/other_errors/double_plusplus.jou +tests/other_errors/array0.jou +tests/other_errors/address_of_minusminus.jou +tests/other_errors/function_wrong_n_args.jou +tests/other_errors/duplicate_enum_member.jou +tests/other_errors/continue_outside_loop.jou +tests/other_errors/varargs_def.jou +tests/other_errors/runtime_return_1.jou +tests/other_errors/dumb_assignment.jou +tests/other_errors/dynamic_array_length.jou +tests/other_errors/var_shadow.jou +tests/other_errors/address_of_array_indexing.jou +tests/other_errors/redefine_imported_func.jou +tests/other_errors/struct_already_exists.jou +tests/other_errors/imported_error.jou +tests/other_errors/break_outside_loop.jou +tests/other_errors/void_as_type.jou +tests/other_errors/dumb_assignment_with_plusequals.jou +tests/other_errors/using_void_function.jou +tests/other_errors/immediate_member_assign.jou +tests/other_errors/unexpected_return_value.jou +tests/other_errors/duplicate_arg_name.jou +tests/other_errors/missing_value_in_return.jou +tests/other_errors/duplicate_field_name.jou +tests/404/enum.jou +tests/404/function.jou +tests/404/var.jou +tests/404/type.jou +tests/404/import_wrong_func.jou +tests/404/struct_field.jou +tests/404/enum_member.jou +tests/404/import_symbol.jou +tests/404/var_addressof.jou +tests/404/file.jou +tests/404/import_symbol_multiline.jou +tests/already_exists_error/global_var.jou +tests/already_exists_error/struct_import.jou +tests/already_exists_error/global_var_import.jou +tests/already_exists_error/struct.jou +tests/already_exists_error/func.jou +tests/already_exists_error/local_var.jou +tests/already_exists_error/func_import.jou +tests/already_exists_error/struct_and_enum.jou +tests/too_long/long.jou +tests/too_long/nested_parentheses.jou +tests/too_long/name.jou +tests/too_long/int.jou diff --git a/src/jou_compiler.h b/src/jou_compiler.h index 
69f99104..a7b93f5a 100644 --- a/src/jou_compiler.h +++ b/src/jou_compiler.h @@ -49,6 +49,7 @@ typedef struct CfInstruction CfInstruction; struct CommandLineFlags { bool verbose; // Whether to print a LOT of debug info + bool tokenize_only; int optlevel; // Optimization level (0 don't optimize, 3 optimize a lot) const char *outfile; // If not NULL, where to output executable const char *linker_flags; // String that is appended to linking command diff --git a/src/main.c b/src/main.c index 1950d5f6..fb459fa7 100644 --- a/src/main.c +++ b/src/main.c @@ -42,6 +42,7 @@ static const char help_fmt[] = " -o OUTFILE output an executable file, don't run the code\n" " -O0/-O1/-O2/-O3 set optimization level (0 = default, 3 = runs fastest)\n" " --verbose display a lot of information about all compilation steps\n" + " --tokenize-only display only the output of the tokenizer, and don't run other compile steps\n" " --linker-flags appended to the linker command, so you can use external libraries\n" ; @@ -76,6 +77,13 @@ static void parse_arguments(int argc, char **argv, CommandLineFlags *flags, cons } else if (!strcmp(argv[i], "--verbose")) { flags->verbose = true; i++; + } else if (!strcmp(argv[i], "--tokenize-only")) { + if (argc > 3) { + fprintf(stderr, "%s: --tokenize-only cannot be used together with other flags", argv[0]); + goto wrong_usage; + } + flags->tokenize_only = true; + i++; } else if (!strcmp(argv[i], "--linker-flags")) { if (flags->linker_flags) { fprintf(stderr, "%s: --linker-flags cannot be given multiple times", argv[0]); @@ -173,8 +181,11 @@ static void parse_file(struct CompileState *compst, const char *filename, const } Token *tokens = tokenize(f, fs.path); fclose(f); - if(compst->flags.verbose) + + if(compst->flags.verbose || compst->flags.tokenize_only) print_tokens(tokens); + if (compst->flags.tokenize_only) + exit(0); fs.ast = parse(tokens, compst->stdlib_path); free_tokens(tokens); diff --git a/tests/should_succeed/compiler_cli.jou b/tests/should_succeed/compiler_cli.jou index 1dfe419b..1a0b97c5 100644 --- a/tests/should_succeed/compiler_cli.jou +++ b/tests/should_succeed/compiler_cli.jou @@ -30,6 +30,7 @@ def main() -> int: run_jou("lolwat.jou") # Output: compiler error in file "lolwat.jou": cannot open file: No such file or directory run_jou("--linker-flags") # Output: : there must be a string of flags after --linker-flags (try " --help") run_jou("--linker-flags x --linker-flags y") # Output: : --linker-flags cannot be given multiple times (try " --help") + run_jou("--tokenize-only -O1 examples/hello.jou") # Output: : --tokenize-only cannot be used together with other flags (try " --help") # Output: Usage: # Output: [-o OUTFILE] [-O0|-O1|-O2|-O3] [--verbose] [--linker-flags "..."] FILENAME diff --git a/tokenizers.sh b/tokenizers.sh new file mode 100755 index 00000000..46fdbe02 --- /dev/null +++ b/tokenizers.sh @@ -0,0 +1,49 @@ +#!/bin/bash +# +# There are two Jou compilers: one written in C and another written in Jou. +# They should be able to tokenize each Jou file in exactly the same way. +# If tokenizing a Jou file fails, both tokenizers should fail with the same error message. + +if [[ "$OS" =~ Windows ]]; then + dotexe=.exe +else + dotexe= +fi + +set -e + +rm -rf tmp/tokenizers +mkdir -v tmp/tokenizers + +echo "Compiling the self-hosted compiler..." 
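+# Use the compiler written in C to build the tokenizer written in Jou.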
+./jou${dotexe} -O1 -o tmp/tokenizers/self_hosted${dotexe} self_hosted/tokenizer.jou
+
+for file in $(find examples tests -name '*.jou'); do
+    echo $file
+    (./jou${dotexe} --tokenize-only $file || true) &> tmp/tokenizers/compiler_written_in_c.txt
+    (tmp/tokenizers/self_hosted${dotexe} $file || true) &> tmp/tokenizers/self_hosted.txt
+
+    if grep -qxF $file self_hosted/tokenizes_wrong.txt; then
+        # The file is skipped, so the two compilers should behave differently
+        if diff tmp/tokenizers/compiler_written_in_c.txt tmp/tokenizers/self_hosted.txt >/dev/null; then
+            echo "  Error: Tokenizers behave the same even though the file is listed in self_hosted/tokenizes_wrong.txt."
+            echo "  To fix this error, delete the \"$file\" line from self_hosted/tokenizes_wrong.txt."
+            exit 1
+        else
+            echo "  Tokenizers behave differently as expected (listed in self_hosted_skip.txt)"
+        fi
+    else
+        if diff -u --color=always tmp/tokenizers/compiler_written_in_c.txt tmp/tokenizers/self_hosted.txt; then
+            echo "  Tokenizers behave the same as expected"
+        else
+            echo "  Error: Tokenizers behave differently when given \"$file\"."
+            echo "  You can silence this error by adding \"$file\" to self_hosted/tokenizes_wrong.txt."
+            echo "  Ideally the tokenizers would behave in the same way for all files, but we aren't there yet."
+            exit 1
+        fi
+    fi
+done
+
+echo ""
+echo ""
+echo "success :)"

From 6bd21dba704d97c6eb9aa113bac7515006bc74f6 Mon Sep 17 00:00:00 2001
From: Akuli
Date: Fri, 24 Feb 2023 23:03:18 +0200
Subject: [PATCH 08/15] fix test

---
 src/main.c                            | 2 +-
 tests/should_succeed/compiler_cli.jou | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/main.c b/src/main.c
index fb459fa7..8241fc0a 100644
--- a/src/main.c
+++ b/src/main.c
@@ -42,7 +42,7 @@ static const char help_fmt[] =
     "  -o OUTFILE       output an executable file, don't run the code\n"
     "  -O0/-O1/-O2/-O3  set optimization level (0 = default, 3 = runs fastest)\n"
     "  --verbose        display a lot of information about all compilation steps\n"
-    "  --tokenize-only  display only the output of the tokenizer, and don't run other compile steps\n"
+    "  --tokenize-only  display only the output of the tokenizer, don't do anything else\n"
    "  --linker-flags   appended to the linker command, so you can use external libraries\n"
     ;

diff --git a/tests/should_succeed/compiler_cli.jou b/tests/should_succeed/compiler_cli.jou
index 1a0b97c5..328908af 100644
--- a/tests/should_succeed/compiler_cli.jou
+++ b/tests/should_succeed/compiler_cli.jou
@@ -41,6 +41,7 @@ def main() -> int:
     # Output:   -o OUTFILE       output an executable file, don't run the code
     # Output:   -O0/-O1/-O2/-O3  set optimization level (0 = default, 3 = runs fastest)
     # Output:   --verbose        display a lot of information about all compilation steps
+    # Output:   --tokenize-only  display only the output of the tokenizer, don't do anything else
     # Output:   --linker-flags   appended to the linker command, so you can use external libraries
     run_jou("--help")

From 70f0c4a362fac5d9bc6aed3c005e77caa5f26c92 Mon Sep 17 00:00:00 2001
From: Akuli
Date: Fri, 24 Feb 2023 23:03:42 +0200
Subject: [PATCH 09/15] fix script

---
 tokenizers.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tokenizers.sh b/tokenizers.sh
index 46fdbe02..7b68999e 100755
--- a/tokenizers.sh
+++ b/tokenizers.sh
@@ -13,7 +13,7 @@ fi
 set -e
 
 rm -rf tmp/tokenizers
-mkdir -v tmp/tokenizers
+mkdir -vp tmp/tokenizers
 
 echo "Compiling the self-hosted compiler..." 
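 # Use the compiler written in C to build the tokenizer written in Jou.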
./jou${dotexe} -O1 -o tmp/tokenizers/self_hosted${dotexe} self_hosted/tokenizer.jou From 4998a7506e1c1732b5a0b52aa26acd0166cf8c61 Mon Sep 17 00:00:00 2001 From: Akuli Date: Fri, 24 Feb 2023 23:05:25 +0200 Subject: [PATCH 10/15] fix echo --- tokenizers.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tokenizers.sh b/tokenizers.sh index 7b68999e..86b14f5d 100755 --- a/tokenizers.sh +++ b/tokenizers.sh @@ -30,7 +30,7 @@ for file in $(find examples tests -name '*.jou'); do echo " To fix this error, delete the \"$file\" line from self_hosted/tokenizes_wrong.txt." exit 1 else - echo " Tokenizers behave differently as expected (listed in self_hosted_skip.txt)" + echo " Tokenizers behave differently as expected (listed in self_hosted/tokenizes_wrong.txt)" fi else if diff -u --color=always tmp/tokenizers/compiler_written_in_c.txt tmp/tokenizers/self_hosted.txt; then From 34249b5e3a0589188adaf1cb439c7fd2d6961eef Mon Sep 17 00:00:00 2001 From: Akuli Date: Fri, 24 Feb 2023 23:12:21 +0200 Subject: [PATCH 11/15] Lets try this --- .github/workflows/windows.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index ffd9e3ce..6de12afa 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -141,7 +141,7 @@ jobs: with: name: windows-zip - run: unzip jou.zip - - run: mv jou/* . + - run: mv tokenizers.sh jou shell: bash - - run: ./tokenizers.sh + - run: (cd jou && ./tokenizers.sh) shell: bash From eebdaa6ba3efe3c6ca26d9733e21e060f43868fb Mon Sep 17 00:00:00 2001 From: Akuli Date: Fri, 24 Feb 2023 23:14:50 +0200 Subject: [PATCH 12/15] lol --- .github/workflows/windows.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index 6de12afa..401e6505 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -134,7 +134,7 @@ jobs: tokenizers: needs: build - runs-on: ubuntu-latest + runs-on: windows-latest steps: - uses: actions/checkout@v3 - uses: actions/download-artifact@v3 From 2e301ef83ff35020ac432c853c9a0d423b9cff14 Mon Sep 17 00:00:00 2001 From: Akuli Date: Fri, 24 Feb 2023 23:32:36 +0200 Subject: [PATCH 13/15] Apply suggestions from code review --- self_hosted/tokenizer.jou | 6 ++++-- self_hosted/tokenizes_wrong.txt | 2 +- src/jou_compiler.h | 2 +- stdlib/io.jou | 2 +- stdlib/mem.jou | 1 + 5 files changed, 8 insertions(+), 5 deletions(-) diff --git a/self_hosted/tokenizer.jou b/self_hosted/tokenizer.jou index 94cfc3a0..bfca58ad 100644 --- a/self_hosted/tokenizer.jou +++ b/self_hosted/tokenizer.jou @@ -31,6 +31,7 @@ struct Token: short_string: byte[100] # Name, Keyword, Operator long_string: byte* # String +# TODO: import this (#227 maybe?) 
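+# (isprint() comes from ctype.h in the C standard library.)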
declare isprint(b: int) -> int def print_token(token: Token*) -> void: @@ -130,8 +131,7 @@ def is_identifier_or_number_byte(b: byte) -> bool: def read_identifier_or_number(self: Tokenizer*, first_byte: byte) -> byte[100]: dest: byte[100] - for i = 0; i < 100; i++: # TODO: memset - dest[i] = '\0' + memset(&dest, 0, sizeof dest) destlen = 0 assert(is_identifier_or_number_byte(first_byte)) @@ -230,6 +230,7 @@ def is_operator_byte(c: byte) -> bool: declare strncmp(s1: byte*, s2: byte*, n: long) -> int +# TODO: move to stdlib def starts_with(s: byte*, prefix: byte*) -> bool: return strncmp(s, prefix, strlen(prefix)) == 0 @@ -344,6 +345,7 @@ def is_keyword(word: byte*) -> bool: return True return False +# TODO: move to stdlib declare atoi(s: byte*) -> int def read_token(self: Tokenizer*) -> Token: diff --git a/self_hosted/tokenizes_wrong.txt b/self_hosted/tokenizes_wrong.txt index 19b9e63b..b54eac84 100644 --- a/self_hosted/tokenizes_wrong.txt +++ b/self_hosted/tokenizes_wrong.txt @@ -1,4 +1,4 @@ -# This is a list of files that are not yet supported by the self-hosted compiler. +# This is a list of files that are not yet supported by the tokenizer of the self-hosted compiler. examples/fib.jou examples/x11_window.jou tests/syntax_error/import_after_def.jou diff --git a/src/jou_compiler.h b/src/jou_compiler.h index a7b93f5a..bbdadb15 100644 --- a/src/jou_compiler.h +++ b/src/jou_compiler.h @@ -49,7 +49,7 @@ typedef struct CfInstruction CfInstruction; struct CommandLineFlags { bool verbose; // Whether to print a LOT of debug info - bool tokenize_only; + bool tokenize_only; // If true, tokenize the file passed on command line and don't actually compile anything int optlevel; // Optimization level (0 don't optimize, 3 optimize a lot) const char *outfile; // If not NULL, where to output executable const char *linker_flags; // String that is appended to linking command diff --git a/stdlib/io.jou b/stdlib/io.jou index e0bfeb96..842e972b 100644 --- a/stdlib/io.jou +++ b/stdlib/io.jou @@ -61,7 +61,7 @@ declare fgetc(file: FILE*) -> int # see getchar() declare fscanf(file: FILE*, pattern: byte*, ...) -> int # Ensure that output is actually written. It may remain buffered -# without calling this function. +# if this function isn't called. 
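+# For example, calling fflush(stdout) before printing an error to stderr
+# makes sure that earlier printf() output shows up first.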
declare fflush(file: FILE*) -> int # Read a line of text from file into a string starting at the given diff --git a/stdlib/mem.jou b/stdlib/mem.jou index b71d01e8..bf303cfc 100644 --- a/stdlib/mem.jou +++ b/stdlib/mem.jou @@ -6,6 +6,7 @@ declare malloc(size: long) -> void* declare realloc(ptr: void*, size: long) -> void* declare free(ptr: void*) -> void +# TODO: explain what each of these does declare memset(dest: void*, fill_byte: int, count: long) -> void* declare memcpy(dest: void*, source: void*, count: long) -> void* declare memmove(dest: void*, source: void*, count: long) -> void* From 334403d3994fb93c870849d86156e53a125dd900 Mon Sep 17 00:00:00 2001 From: Akuli Date: Fri, 24 Feb 2023 23:32:40 +0200 Subject: [PATCH 14/15] Fixening --- .github/workflows/windows.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index 401e6505..c4a5966d 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -141,7 +141,7 @@ jobs: with: name: windows-zip - run: unzip jou.zip - - run: mv tokenizers.sh jou + - run: mv tokenizers.sh self_hosted jou shell: bash - run: (cd jou && ./tokenizers.sh) shell: bash From e233ca067d025677d011acb298ea7ac30faa0500 Mon Sep 17 00:00:00 2001 From: Akuli Date: Fri, 24 Feb 2023 23:46:15 +0200 Subject: [PATCH 15/15] cleanup/fix --- src/main.c | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/src/main.c b/src/main.c index 8241fc0a..d5e385da 100644 --- a/src/main.c +++ b/src/main.c @@ -165,6 +165,18 @@ static struct FileState *find_file(const struct CompileState *compst, const char return NULL; } +static FILE *open_the_file(const char *path, const Location *import_location) +{ + FILE *f = fopen(path, "rb"); + if (!f) { + if (import_location) + fail_with_error(*import_location, "cannot import from \"%s\": %s", path, strerror(errno)); + else + fail_with_error((Location){.filename=path}, "cannot open file: %s", strerror(errno)); + } + return f; +} + static void parse_file(struct CompileState *compst, const char *filename, const Location *import_location) { if (find_file(compst, filename)) @@ -172,20 +184,12 @@ static void parse_file(struct CompileState *compst, const char *filename, const struct FileState fs = { .path = strdup(filename) }; - FILE *f = fopen(fs.path, "rb"); - if (!f) { - if (import_location) - fail_with_error(*import_location, "cannot import from \"%s\": %s", filename, strerror(errno)); - else - fail_with_error((Location){.filename=filename}, "cannot open file: %s", strerror(errno)); - } + FILE *f = open_the_file(fs.path, import_location); Token *tokens = tokenize(f, fs.path); fclose(f); - if(compst->flags.verbose || compst->flags.tokenize_only) + if(compst->flags.verbose) print_tokens(tokens); - if (compst->flags.tokenize_only) - exit(0); fs.ast = parse(tokens, compst->stdlib_path); free_tokens(tokens); @@ -405,6 +409,15 @@ int main(int argc, char **argv) printf("Data layout: %s\n", get_target()->data_layout); } + if (compst.flags.tokenize_only) { + FILE *f = open_the_file(filename, NULL); + Token *tokens = tokenize(f, filename); + fclose(f); + print_tokens(tokens); + free_tokens(tokens); + return 0; + } + #ifdef _WIN32 char *startup_path = malloc(strlen(compst.stdlib_path) + 50); sprintf(startup_path, "%s/_windows_startup.jou", compst.stdlib_path);