diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 4c9dcb7a..73d4b72f 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -27,3 +27,11 @@ jobs: git status --ignored exit 1 fi + + tokenizers: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - run: sudo apt install -y llvm-13-dev clang-13 make valgrind + - run: LLVM_CONFIG=llvm-config-13 make + - run: ./tokenizers.sh diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index 6fcad65a..c4a5966d 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -131,3 +131,17 @@ jobs: shell: bash - run: cd "test dir" && ./runtests.sh --verbose shell: bash + + tokenizers: + needs: build + runs-on: windows-latest + steps: + - uses: actions/checkout@v3 + - uses: actions/download-artifact@v3 + with: + name: windows-zip + - run: unzip jou.zip + - run: mv tokenizers.sh self_hosted jou + shell: bash + - run: (cd jou && ./tokenizers.sh) + shell: bash diff --git a/self_hosted/errors_and_warnings.jou b/self_hosted/errors_and_warnings.jou new file mode 100644 index 00000000..663461d9 --- /dev/null +++ b/self_hosted/errors_and_warnings.jou @@ -0,0 +1,25 @@ +from "stdlib/process.jou" import exit +from "stdlib/io.jou" import stdout, stderr, fprintf, fflush + +struct Location: + path: byte* # Not owned. Points to a string that is held elsewhere. + lineno: int + +def fail(location: Location, message: byte*) -> void: + # When stdout is redirected to same place as stderr, + # make sure that normal printf()s show up before our error. 
+ fflush(stdout) + fflush(stderr) + + fprintf(stderr, "compiler error in file \"%s\"", location.path) + if location.lineno != 0: + fprintf(stderr, ", line %d", location.lineno) + fprintf(stderr, ": %s\n", message) + + exit(1) + +# TODO: doesn't really belong here +def assert(b: bool) -> void: + if not b: + fprintf(stderr, "assertion failed\n") + exit(1) diff --git a/self_hosted/tokenizer.jou b/self_hosted/tokenizer.jou new file mode 100644 index 00000000..bfca58ad --- /dev/null +++ b/self_hosted/tokenizer.jou @@ -0,0 +1,498 @@ +from "stdlib/io.jou" import printf, FILE, fgetc, ferror, fopen +from "stdlib/str.jou" import sprintf, strlen, strchr, strcmp +from "stdlib/mem.jou" import malloc, realloc, free, memset, memmove +from "./errors_and_warnings.jou" import Location, fail, assert + +enum TokenKind: + Int + Long + Float + Double + Byte # example: 'a' is 97 as a byte + String + Name + Keyword + Newline + Indent + Dedent + Operator + EndOfFile # Marks the end of an array of tokens. + +struct Token: + kind: TokenKind + location: Location + + # Only one of these is used at a time. + # TODO: union + int_value: int # Int + long_value: long # Long + byte_value: byte # Byte + indentation_level: int # Newline (indicates how many spaces there are after the newline) + short_string: byte[100] # Name, Keyword, Operator + long_string: byte* # String + +# TODO: import this (#227 maybe?) 
+declare isprint(b: int) -> int + +def print_token(token: Token*) -> void: + if token->kind == TokenKind::Int: + printf("integer %d\n", token->int_value) + elif token->kind == TokenKind::Long: + printf("long %lld\n", token->long_value) + elif token->kind == TokenKind::Float: + printf("float %s\n", &token->short_string[0]) + elif token->kind == TokenKind::Double: + printf("double %s\n", &token->short_string[0]) + elif token->kind == TokenKind::Byte: + printf("character %#02x", token->byte_value) + if isprint(token->byte_value) != 0: + printf(" '%c'", token->byte_value) + printf("\n") + elif token->kind == TokenKind::EndOfFile: + printf("end of file\n") + elif token->kind == TokenKind::Operator: + printf("operator '%s'\n", &token->short_string[0]) + elif token->kind == TokenKind::Name: + printf("name \"%s\"\n", &token->short_string[0]) + elif token->kind == TokenKind::Keyword: + printf("keyword \"%s\"\n", &token->short_string[0]) + elif token->kind == TokenKind::Newline: + printf("newline token (next line has %d spaces of indentation)\n", token->indentation_level) + elif token->kind == TokenKind::String: + printf("string \"%s\"\n", token->long_string) + elif token->kind == TokenKind::Indent: + printf("indent (+4 spaces)\n") + elif token->kind == TokenKind::Dedent: + printf("dedent (-4 spaces)\n") + else: + printf("????\n") + +struct Tokenizer: + f: FILE* + location: Location + pushback: byte* + pushback_len: int # TODO: dynamic array + # Parens array isn't dynamic, so that you can't segfault + # the compiler by feeding it lots of nested parentheses, + # which would make it recurse too deep. + parens: Token[50] + parens_len: int + +def read_byte(self: Tokenizer*) -> byte: + EOF = -1 # FIXME + + c: byte + if self->pushback_len > 0: + c = self->pushback[--self->pushback_len] + else: + temp = fgetc(self->f) + if temp == '\r': + # On Windows, \r just before \n is ignored. + temp = fgetc(self->f) + if temp != EOF and temp != '\n': + # TODO: test this, if possible? 
+ fail(self->location, "source file contains a CR byte ('\\r') that isn't a part of a CRLF line ending") + + if temp == EOF: + if ferror(self->f) != 0: + # TODO: include errno in the error message + fail(self->location, "cannot read file") + # Use the zero byte to denote end of file. + c = '\0' + elif temp == '\0': + # TODO: test this + fail(self->location, "source file contains a zero byte") + c = 'x' # TODO: silences compiler warning, but never runs + else: + c = temp as byte + + if c == '\n': + self->location.lineno++ + return c + + +def unread_byte(self: Tokenizer*, b: byte) -> void: + if b == '\0': + return + + assert(b != '\r') + self->pushback = realloc(self->pushback, self->pushback_len + 1) + self->pushback[self->pushback_len++] = b + if b == '\n': + self->location.lineno-- + +def is_identifier_or_number_byte(b: byte) -> bool: + return ( + ('A' <= b and b <= 'Z') + or ('a' <= b and b <= 'z') + or ('0' <= b and b <= '9') + or b == '_' + ) + +def read_identifier_or_number(self: Tokenizer*, first_byte: byte) -> byte[100]: + dest: byte[100] + memset(&dest, 0, sizeof dest) + destlen = 0 + + assert(is_identifier_or_number_byte(first_byte)) + dest[destlen++] = first_byte + + while True: + b = read_byte(self) + if is_identifier_or_number_byte(b): + if destlen == sizeof dest - 1: + fail(self->location, "name or number is too long") + dest[destlen++] = b + else: + unread_byte(self, b) + return dest + +def consume_rest_of_line(self: Tokenizer*) -> void: + while True: + c = read_byte(self) + if c == '\0' or c == '\n': + break + +# Returns the indentation level for the next line +def read_newline_token(self: Tokenizer*) -> int: + level = 0 + while True: + c = read_byte(self) + if c == '\0': + # End of file. Do not validate that indentation is a + # multiple of 4 spaces. Add a trailing newline implicitly + # if needed. 
+            #
+            # TODO: test this
+            return 0
+        elif c == '\n':
+            level = 0
+        elif c == '#':
+            consume_rest_of_line(self)
+            level = 0
+        elif c == ' ':
+            level++
+        else:
+            unread_byte(self, c)
+            return level
+
+def read_string(self: Tokenizer*) -> byte*:
+    result: byte* = NULL
+    len = 0
+
+    while True:
+        c = read_byte(self)
+        if c == '"':
+            break
+        elif c == '\n' or c == '\0':
+            if c == '\n':
+                self->location.lineno--
+            fail(self->location, "missing \" to end the string")
+        elif c == '\\':
+            # \n means newline, for example
+            after_backslash = read_byte(self)
+            if after_backslash == '\0':
+                fail(self->location, "missing \" to end the string")
+            elif after_backslash == 'n':
+                result = realloc(result, len+1)
+                result[len++] = '\n'
+            elif after_backslash == 'r':
+                result = realloc(result, len+1)
+                result[len++] = '\r'
+            elif after_backslash == '\\' or after_backslash == '"':
+                result = realloc(result, len+1)
+                result[len++] = after_backslash
+            elif after_backslash == '0':
+                fail(self->location, "strings cannot contain zero bytes (\\0), because that is the special end marker byte")
+            elif '0' <= after_backslash and after_backslash <= '9':
+                result = realloc(result, len+1)
+                result[len++] = after_backslash - '0'
+            elif after_backslash == '\n':
+                # \ at end of line, string continues on next line
+                len = len  # TODO: pass statement
+            else:
+                if after_backslash < 0x80 and isprint(after_backslash) != 0:
+                    message: byte* = malloc(100)
+                    sprintf(message, "unknown escape: '\\%c'", after_backslash)
+                    fail(self->location, message)
+                else:
+                    fail(self->location, "unknown '\\' escape")
+        else:
+            result = realloc(result, len+1)
+            result[len++] = c
+
+    result = realloc(result, len+1)
+    result[len] = '\0'
+    return result
+
+def is_operator_byte(c: byte) -> bool:
+    return c != '\0' and strchr("=<>!.,()[]{};:+-*/&%", c) != NULL
+
+declare strncmp(s1: byte*, s2: byte*, n: long) -> int
+
+# TODO: move to stdlib
+def starts_with(s: byte*, prefix: byte*) -> bool:
+    return strncmp(s, prefix, 
strlen(prefix)) == 0 + +def read_operator(self: Tokenizer*) -> byte[100]: + # TODO: nicer array syntax + operators: byte*[100] + i = 0 + # Longer operators first, so that '==' does not parse as '=' '=' + operators[i++] = "..." + operators[i++] = "===" + operators[i++] = "!==" + operators[i++] = "==" + operators[i++] = "!=" + operators[i++] = "->" + operators[i++] = "<=" + operators[i++] = ">=" + operators[i++] = "++" + operators[i++] = "--" + operators[i++] = "+=" + operators[i++] = "-=" + operators[i++] = "*=" + operators[i++] = "/=" + operators[i++] = "%=" + operators[i++] = "::" + operators[i++] = "." + operators[i++] = "," + operators[i++] = ":" + operators[i++] = ";" + operators[i++] = "=" + operators[i++] = "(" + operators[i++] = ")" + operators[i++] = "{" + operators[i++] = "}" + operators[i++] = "[" + operators[i++] = "]" + operators[i++] = "&" + operators[i++] = "%" + operators[i++] = "*" + operators[i++] = "/" + operators[i++] = "+" + operators[i++] = "-" + operators[i++] = "<" + operators[i++] = ">" + operators[i] = NULL + + operator: byte[100] + memset(&operator, 0, sizeof operator) + + # Read as many operator characters as we may need. + while strlen(&operator[0]) < 3: + c = read_byte(self) + if not is_operator_byte(c): + unread_byte(self, c) + break + operator[strlen(&operator[0])] = c + + for op = &operators[0]; *op != NULL; op++: + if starts_with(&operator[0], *op): + # Unread the bytes we didn't use. + while strlen(&operator[0]) > strlen(*op): + last = &operator[strlen(&operator[0]) - 1] + unread_byte(self, *last) + *last = '\0' + + # "===" and "!==" are here only to give a better error message to javascript people. 
+ if strcmp(&operator[0], "===") != 0 and strcmp(&operator[0], "!==") != 0: + return operator + + message: byte[100] + sprintf(&message[0], "there is no '%s' operator", &operator[0]) + fail(self->location, &message[0]) + return operator # TODO: never actually runs, but causes a compiler warning + +def is_keyword(word: byte*) -> bool: + # TODO: better array syntax + keywords: byte*[100] + i = 0 + keywords[i++] = "from" + keywords[i++] = "import" + keywords[i++] = "def" + keywords[i++] = "declare" + keywords[i++] = "struct" + keywords[i++] = "enum" + keywords[i++] = "global" + keywords[i++] = "return" + keywords[i++] = "if" + keywords[i++] = "elif" + keywords[i++] = "else" + keywords[i++] = "while" + keywords[i++] = "for" + keywords[i++] = "break" + keywords[i++] = "continue" + keywords[i++] = "True" + keywords[i++] = "False" + keywords[i++] = "NULL" + keywords[i++] = "and" + keywords[i++] = "or" + keywords[i++] = "not" + keywords[i++] = "as" + keywords[i++] = "sizeof" + keywords[i++] = "void" + keywords[i++] = "bool" + keywords[i++] = "byte" + keywords[i++] = "int" + keywords[i++] = "long" + keywords[i++] = "float" + keywords[i++] = "double" + keywords[i++] = NULL + + for kw = &keywords[0]; *kw != NULL; kw++: + if strcmp(*kw, word) == 0: + return True + return False + +# TODO: move to stdlib +declare atoi(s: byte*) -> int + +def read_token(self: Tokenizer*) -> Token: + while True: + token = Token{location = self->location} + b = read_byte(self) + if b == ' ': + continue + + if b == '\n': + if self->parens_len > 0: + continue + token.kind = TokenKind::Newline + token.indentation_level = read_newline_token(self) + elif b == '"': + token.kind = TokenKind::String + token.long_string = read_string(self) + elif is_identifier_or_number_byte(b): + token.short_string = read_identifier_or_number(self, b) + if is_keyword(&token.short_string[0]): + token.kind = TokenKind::Keyword + elif '0' <= token.short_string[0] and token.short_string[0] <= '9': + # TODO: support various 
other things + token.kind = TokenKind::Int + token.int_value = atoi(&token.short_string[0]) + else: + token.kind = TokenKind::Name + elif is_operator_byte(b): + unread_byte(self, b) + token.kind = TokenKind::Operator + token.short_string = read_operator(self) + elif b == '\0': + token.kind = TokenKind::EndOfFile + else: + message: byte[100] + sprintf(&message[0], "unexpected byte %#02x", b) + fail(self->location, &message[0]) + return token + +def tokenize_without_indent_dedent_tokens(file: FILE*, path: byte*) -> Token*: + tokenizer = Tokenizer{ + location = Location{path = path}, + f = file, + } + + # Add a fake newline to the beginning. It does a few things: + # * Less special-casing: blank lines in the beginning of the file can + # cause there to be a newline token anyway. + # * It is easier to detect an unexpected indentation in the beginning + # of the file, as it becomes just like any other indentation. + # * Line numbers start at 1. + tokenizer.pushback = malloc(1) + tokenizer.pushback[0] = '\n' + tokenizer.pushback_len = 1 + + tokens: Token* = NULL + len = 0 + while len == 0 or tokens[len-1].kind != TokenKind::EndOfFile: + tokens = realloc(tokens, sizeof(tokens[0]) * (len+1)) + tokens[len++] = read_token(&tokenizer) + + free(tokenizer.pushback) + return tokens + +# Creates a new array of tokens with indent/dedent tokens added after +# newline tokens that change the indentation level. +def handle_indentations(raw_tokens: Token*) -> Token*: + tokens: Token* = NULL + ntokens = 0 + level = 0 + + for t = raw_tokens; True; t++: + if t->kind == TokenKind::EndOfFile: + # Add an extra newline token at end of file and the dedents after it. + # This makes it similar to how other newline and dedent tokens work: + # the dedents always come after a newline token. 
+ tokens = realloc(tokens, sizeof tokens[0] * (ntokens + level/4 + 1)) + while level != 0: + tokens[ntokens++] = Token{location = t->location, kind = TokenKind::Dedent} + level -= 4 + tokens[ntokens++] = *t + break + + tokens = realloc(tokens, sizeof tokens[0] * (ntokens+1)) + tokens[ntokens++] = *t + + if t->kind == TokenKind::Newline: + after_newline = t->location + after_newline.lineno++ + + if t->indentation_level % 4 != 0: + fail(after_newline, "indentation must be a multiple of 4 spaces") + + while level < t->indentation_level: + tokens = realloc(tokens, sizeof tokens[0] * (ntokens+1)) + tokens[ntokens++] = Token{location = after_newline, kind = TokenKind::Indent} + level += 4 + + while level > t->indentation_level: + tokens = realloc(tokens, sizeof tokens[0] * (ntokens+1)) + tokens[ntokens++] = Token{location = after_newline, kind = TokenKind::Dedent} + level -= 4 + + # Delete the newline token in the beginning. + # + # If the file has indentations after it, they are now represented by separate + # indent tokens and parsing will fail. If the file doesn't have any blank/comment + # lines in the beginning, it has a newline token anyway to avoid special casing. 
+ assert(tokens[0].kind == TokenKind::Newline) + memmove(&tokens[0], &tokens[1], sizeof tokens[0] * (ntokens - 1)) + + return tokens + +def tokenize(path: byte*) -> Token*: + file = fopen(path, "rb") + if file == NULL: + # TODO: test this + # TODO: include errno in the message + fail(Location{path=path}, "cannot open file") + + raw_tokens = tokenize_without_indent_dedent_tokens(file, path) + better_tokens = handle_indentations(raw_tokens) + free(raw_tokens) + return better_tokens + +def print_tokens(tokens: Token*) -> void: + printf("===== Tokens for file \"%s\" =====\n", tokens->location.path) + t = tokens + current_lineno = -1 + + while True: + if t->location.lineno != current_lineno: + current_lineno = t->location.lineno + printf("\nLine %d:\n", current_lineno) + + printf(" ") + print_token(t) + + if t->kind == TokenKind::EndOfFile: + break + t++ + + printf("\n") + +def main(argc: int, argv: byte**) -> int: + assert(argc == 2) + tokens = tokenize(argv[1]) + print_tokens(tokens) + free(tokens) + return 0 diff --git a/self_hosted/tokenizes_wrong.txt b/self_hosted/tokenizes_wrong.txt new file mode 100644 index 00000000..b54eac84 --- /dev/null +++ b/self_hosted/tokenizes_wrong.txt @@ -0,0 +1,186 @@ +# This is a list of files that are not yet supported by the tokenizer of the self-hosted compiler. 
+examples/fib.jou +examples/x11_window.jou +tests/syntax_error/import_after_def.jou +tests/syntax_error/missing_return_type.jou +tests/syntax_error/missing_arg_type.jou +tests/syntax_error/declare_global_with_value.jou +tests/syntax_error/arg_after_dotdotdot.jou +tests/syntax_error/chained_eq.jou +tests/syntax_error/bad_type.jou +tests/syntax_error/missing_import_keyword.jou +tests/syntax_error/missing_second_equal_sign.jou +tests/syntax_error/hex.jou +tests/syntax_error/double_assignment.jou +tests/syntax_error/double_with_letters_after.jou +tests/syntax_error/dot_after_e.jou +tests/syntax_error/unnecessary_zero.jou +tests/syntax_error/bad_addressof.jou +tests/syntax_error/bin.jou +tests/syntax_error/arg_default.jou +tests/syntax_error/def_missing_args.jou +tests/syntax_error/bad_expression.jou +tests/syntax_error/missing_field_names.jou +tests/syntax_error/bad_field.jou +tests/syntax_error/string_zero_byte.jou +tests/syntax_error/triple_equals.jou +tests/syntax_error/import_missing_quotes.jou +tests/syntax_error/unknown_escape_ascii.jou +tests/syntax_error/bad_struct_field_name.jou +tests/syntax_error/python_style_for.jou +tests/syntax_error/import1.jou +tests/syntax_error/and_or_chaining.jou +tests/syntax_error/bad_function_name_after_def.jou +tests/syntax_error/empty_char.jou +tests/syntax_error/import_missing_comma_with_parens.jou +tests/syntax_error/2bad.jou +tests/syntax_error/array_size.jou +tests/syntax_error/multidot_float.jou +tests/syntax_error/0b2.jou +tests/syntax_error/ee.jou +tests/syntax_error/overlong_char.jou +tests/syntax_error/dotdotdot_dotdotdot.jou +tests/syntax_error/bad_byte.jou +tests/syntax_error/first_line_indent.jou +tests/syntax_error/too_many_closing_parens.jou +tests/syntax_error/indentation_not4.jou +tests/syntax_error/import_missing_dot.jou +tests/syntax_error/unknown_escape_multibyte.jou +tests/syntax_error/infinite_c_style_for.jou +tests/syntax_error/struct_missing_type.jou +tests/syntax_error/bad_toplevel_declaration.jou 
+tests/syntax_error/missing_indentation.jou +tests/syntax_error/import_missing_comma.jou +tests/syntax_error/bad_argument_name.jou +tests/syntax_error/mismatched_close_brace.jou +tests/syntax_error/bad_statement.jou +tests/syntax_error/indexing.jou +tests/syntax_error/struct_init_js_syntax.jou +tests/syntax_error/missing_colon.jou +tests/syntax_error/missing_number_after_eminus.jou +tests/syntax_error/bad_struct_name.jou +tests/syntax_error/missing_number_after_e.jou +tests/syntax_error/chained_le.jou +tests/syntax_error/float.jou +tests/syntax_error/double_not.jou +tests/syntax_error/too_many_opening_parens.jou +tests/syntax_error/struct_default.jou +tests/crash/null_deref.jou +tests/wrong_type/assign_void.jou +tests/wrong_type/arg.jou +tests/wrong_type/assign_with_type.jou +tests/wrong_type/while.jou +tests/wrong_type/var_assignment.jou +tests/wrong_type/cannot_be_indexed.jou +tests/wrong_type/float_and_double.jou +tests/wrong_type/struct_member_assign.jou +tests/wrong_type/deref_non_pointer.jou +tests/wrong_type/elif.jou +tests/wrong_type/int_to_enum.jou +tests/wrong_type/arrow_operator_not_struct.jou +tests/wrong_type/enum_member_from_struct.jou +tests/wrong_type/enum_to_int.jou +tests/wrong_type/assign_to_deref_non_pointer.jou +tests/wrong_type/index.jou +tests/wrong_type/dot_operator.jou +tests/wrong_type/neg.jou +tests/wrong_type/array_to_ptr.jou +tests/wrong_type/arg_with_varargs.jou +tests/wrong_type/inplace_add_doesnt_go_back.jou +tests/wrong_type/mod.jou +tests/wrong_type/pointer_assignment.jou +tests/wrong_type/for.jou +tests/wrong_type/bool_main.jou +tests/wrong_type/plusplus.jou +tests/wrong_type/not.jou +tests/wrong_type/brace_init_arg.jou +tests/wrong_type/return_value.jou +tests/wrong_type/if.jou +tests/wrong_type/struct_member_init.jou +tests/wrong_type/void_main.jou +tests/wrong_type/array_vararg.jou +tests/wrong_type/pointer_eq.jou +tests/wrong_type/or.jou +tests/wrong_type/arrow_operator_not_pointer.jou +tests/should_succeed/assign.jou 
+tests/should_succeed/enum.jou +tests/should_succeed/string_syntax.jou +tests/should_succeed/octalnuber.jou +tests/should_succeed/sizeof.jou +tests/should_succeed/global_bug.jou +tests/should_succeed/crlf.jou +tests/should_succeed/loops.jou +tests/should_succeed/add_sub_mul_div_mod.jou +tests/should_succeed/printf.jou +tests/should_succeed/undefined_value_warning.jou +tests/should_succeed/global.jou +tests/should_succeed/pointer.jou +tests/should_succeed/mathlibtest.jou +tests/should_succeed/sscanf.jou +tests/should_succeed/plusplus_minusminus.jou +tests/should_succeed/stderr.jou +tests/should_succeed/return_void.jou +tests/should_succeed/unreachable_warning.jou +tests/should_succeed/local_import.jou +tests/should_succeed/array.jou +tests/should_succeed/compare.jou +tests/should_succeed/and_or_not.jou +tests/should_succeed/struct.jou +tests/should_succeed/argument.jou +tests/should_succeed/compiler_cli.jou +tests/should_succeed/file.jou +tests/should_succeed/expfloat.jou +tests/should_succeed/implicit_conversions.jou +tests/should_succeed/return_string.jou +tests/should_succeed/as.jou +tests/should_succeed/if_elif_else.jou +tests/should_succeed/unused_import.jou +tests/other_errors/missing_return.jou +tests/other_errors/brace_init_dupe.jou +tests/other_errors/double_plusplus.jou +tests/other_errors/array0.jou +tests/other_errors/address_of_minusminus.jou +tests/other_errors/function_wrong_n_args.jou +tests/other_errors/duplicate_enum_member.jou +tests/other_errors/continue_outside_loop.jou +tests/other_errors/varargs_def.jou +tests/other_errors/runtime_return_1.jou +tests/other_errors/dumb_assignment.jou +tests/other_errors/dynamic_array_length.jou +tests/other_errors/var_shadow.jou +tests/other_errors/address_of_array_indexing.jou +tests/other_errors/redefine_imported_func.jou +tests/other_errors/struct_already_exists.jou +tests/other_errors/imported_error.jou +tests/other_errors/break_outside_loop.jou +tests/other_errors/void_as_type.jou 
+tests/other_errors/dumb_assignment_with_plusequals.jou +tests/other_errors/using_void_function.jou +tests/other_errors/immediate_member_assign.jou +tests/other_errors/unexpected_return_value.jou +tests/other_errors/duplicate_arg_name.jou +tests/other_errors/missing_value_in_return.jou +tests/other_errors/duplicate_field_name.jou +tests/404/enum.jou +tests/404/function.jou +tests/404/var.jou +tests/404/type.jou +tests/404/import_wrong_func.jou +tests/404/struct_field.jou +tests/404/enum_member.jou +tests/404/import_symbol.jou +tests/404/var_addressof.jou +tests/404/file.jou +tests/404/import_symbol_multiline.jou +tests/already_exists_error/global_var.jou +tests/already_exists_error/struct_import.jou +tests/already_exists_error/global_var_import.jou +tests/already_exists_error/struct.jou +tests/already_exists_error/func.jou +tests/already_exists_error/local_var.jou +tests/already_exists_error/func_import.jou +tests/already_exists_error/struct_and_enum.jou +tests/too_long/long.jou +tests/too_long/nested_parentheses.jou +tests/too_long/name.jou +tests/too_long/int.jou diff --git a/src/jou_compiler.h b/src/jou_compiler.h index 69f99104..bbdadb15 100644 --- a/src/jou_compiler.h +++ b/src/jou_compiler.h @@ -49,6 +49,7 @@ typedef struct CfInstruction CfInstruction; struct CommandLineFlags { bool verbose; // Whether to print a LOT of debug info + bool tokenize_only; // If true, tokenize the file passed on command line and don't actually compile anything int optlevel; // Optimization level (0 don't optimize, 3 optimize a lot) const char *outfile; // If not NULL, where to output executable const char *linker_flags; // String that is appended to linking command diff --git a/src/main.c b/src/main.c index 1950d5f6..d5e385da 100644 --- a/src/main.c +++ b/src/main.c @@ -42,6 +42,7 @@ static const char help_fmt[] = " -o OUTFILE output an executable file, don't run the code\n" " -O0/-O1/-O2/-O3 set optimization level (0 = default, 3 = runs fastest)\n" " --verbose display a lot of 
information about all compilation steps\n" + " --tokenize-only display only the output of the tokenizer, don't do anything else\n" " --linker-flags appended to the linker command, so you can use external libraries\n" ; @@ -76,6 +77,13 @@ static void parse_arguments(int argc, char **argv, CommandLineFlags *flags, cons } else if (!strcmp(argv[i], "--verbose")) { flags->verbose = true; i++; + } else if (!strcmp(argv[i], "--tokenize-only")) { + if (argc > 3) { + fprintf(stderr, "%s: --tokenize-only cannot be used together with other flags", argv[0]); + goto wrong_usage; + } + flags->tokenize_only = true; + i++; } else if (!strcmp(argv[i], "--linker-flags")) { if (flags->linker_flags) { fprintf(stderr, "%s: --linker-flags cannot be given multiple times", argv[0]); @@ -157,6 +165,18 @@ static struct FileState *find_file(const struct CompileState *compst, const char return NULL; } +static FILE *open_the_file(const char *path, const Location *import_location) +{ + FILE *f = fopen(path, "rb"); + if (!f) { + if (import_location) + fail_with_error(*import_location, "cannot import from \"%s\": %s", path, strerror(errno)); + else + fail_with_error((Location){.filename=path}, "cannot open file: %s", strerror(errno)); + } + return f; +} + static void parse_file(struct CompileState *compst, const char *filename, const Location *import_location) { if (find_file(compst, filename)) @@ -164,15 +184,10 @@ static void parse_file(struct CompileState *compst, const char *filename, const struct FileState fs = { .path = strdup(filename) }; - FILE *f = fopen(fs.path, "rb"); - if (!f) { - if (import_location) - fail_with_error(*import_location, "cannot import from \"%s\": %s", filename, strerror(errno)); - else - fail_with_error((Location){.filename=filename}, "cannot open file: %s", strerror(errno)); - } + FILE *f = open_the_file(fs.path, import_location); Token *tokens = tokenize(f, fs.path); fclose(f); + if(compst->flags.verbose) print_tokens(tokens); @@ -394,6 +409,15 @@ int main(int 
argc, char **argv) printf("Data layout: %s\n", get_target()->data_layout); } + if (compst.flags.tokenize_only) { + FILE *f = open_the_file(filename, NULL); + Token *tokens = tokenize(f, filename); + fclose(f); + print_tokens(tokens); + free_tokens(tokens); + return 0; + } + #ifdef _WIN32 char *startup_path = malloc(strlen(compst.stdlib_path) + 50); sprintf(startup_path, "%s/_windows_startup.jou", compst.stdlib_path); diff --git a/src/print.c b/src/print.c index 215fca53..4588c080 100644 --- a/src/print.c +++ b/src/print.c @@ -93,10 +93,10 @@ void print_token(const Token *token) printf("end of file\n"); break; case TOKEN_INDENT: - printf("more indentation (+4 spaces)\n"); + printf("indent (+4 spaces)\n"); break; case TOKEN_DEDENT: - printf("less indentation (-4 spaces)\n"); + printf("dedent (-4 spaces)\n"); break; case TOKEN_OPERATOR: printf("operator '%s'\n", token->data.operator); diff --git a/stdlib/io.jou b/stdlib/io.jou index 5a13461f..842e972b 100644 --- a/stdlib/io.jou +++ b/stdlib/io.jou @@ -60,6 +60,10 @@ declare fprintf(file: FILE *, pattern: byte*, ...) -> int declare fgetc(file: FILE*) -> int # see getchar() declare fscanf(file: FILE*, pattern: byte*, ...) -> int +# Ensure that output is actually written. It may remain buffered +# if this function isn't called. +declare fflush(file: FILE*) -> int + # Read a line of text from file into a string starting at the given # pointer. Reading stops at newline character, end of file, on error, # or when the resulting string (including the '\0') wouldn't fit @@ -68,5 +72,9 @@ declare fscanf(file: FILE*, pattern: byte*, ...) -> int # Return value: NULL on error, same as destination on success. declare fgets(destination: byte*, n: int, file: FILE*) -> byte* +# TODO: document +declare feof(file: FILE*) -> int +declare ferror(file: FILE*) -> int + # Move back to beginning of file. 
declare rewind(file: FILE*) -> void diff --git a/stdlib/mem.jou b/stdlib/mem.jou index 94250f58..bf303cfc 100644 --- a/stdlib/mem.jou +++ b/stdlib/mem.jou @@ -3,6 +3,10 @@ # Heap allocations # TODO: write a tutorial about using these and add a link declare malloc(size: long) -> void* +declare realloc(ptr: void*, size: long) -> void* declare free(ptr: void*) -> void +# TODO: explain what each of these does +declare memset(dest: void*, fill_byte: int, count: long) -> void* declare memcpy(dest: void*, source: void*, count: long) -> void* +declare memmove(dest: void*, source: void*, count: long) -> void* diff --git a/stdlib/str.jou b/stdlib/str.jou index bdbe565d..0fb56698 100644 --- a/stdlib/str.jou +++ b/stdlib/str.jou @@ -11,3 +11,12 @@ declare snprintf(dest: byte*, n: long, pattern: byte*, ...) -> int # Find a substring. Return a pointer to the occurrence in haystack, or NULL if not found. declare strstr(haystack: byte*, needle: byte*) -> byte* + +# Similar to strstr(), but searches for a single byte rather than a substring. +declare strchr(haystack: byte*, needle: byte) -> byte* + +# Calculate the length of a string in bytes. Note that strlen("รถ") == 2, for example. +declare strlen(s: byte*) -> long + +# Compare the strings. Return 0 for equal, or nonzero for not equal. 
+declare strcmp(s1: byte*, s2: byte*) -> int diff --git a/tests/should_succeed/compiler_cli.jou b/tests/should_succeed/compiler_cli.jou index 1dfe419b..328908af 100644 --- a/tests/should_succeed/compiler_cli.jou +++ b/tests/should_succeed/compiler_cli.jou @@ -30,6 +30,7 @@ def main() -> int: run_jou("lolwat.jou") # Output: compiler error in file "lolwat.jou": cannot open file: No such file or directory run_jou("--linker-flags") # Output: : there must be a string of flags after --linker-flags (try " --help") run_jou("--linker-flags x --linker-flags y") # Output: : --linker-flags cannot be given multiple times (try " --help") + run_jou("--tokenize-only -O1 examples/hello.jou") # Output: : --tokenize-only cannot be used together with other flags (try " --help") # Output: Usage: # Output: [-o OUTFILE] [-O0|-O1|-O2|-O3] [--verbose] [--linker-flags "..."] FILENAME @@ -40,6 +41,7 @@ def main() -> int: # Output: -o OUTFILE output an executable file, don't run the code # Output: -O0/-O1/-O2/-O3 set optimization level (0 = default, 3 = runs fastest) # Output: --verbose display a lot of information about all compilation steps + # Output: --tokenize-only display only the output of the tokenizer, don't do anything else # Output: --linker-flags appended to the linker command, so you can use external libraries run_jou("--help") diff --git a/tokenizers.sh b/tokenizers.sh new file mode 100755 index 00000000..86b14f5d --- /dev/null +++ b/tokenizers.sh @@ -0,0 +1,49 @@ +#!/bin/bash +# +# There are two Jou compilers: one written in C and another written in Jou. +# They should be able to tokenize each Jou file in exactly the same way. +# If tokenizing a Jou file fails, both tokenizers should fail with the same error message. + +if [[ "$OS" =~ Windows ]]; then + dotexe=.exe +else + dotexe= +fi + +set -e + +rm -rf tmp/tokenizers +mkdir -vp tmp/tokenizers + +echo "Compiling the self-hosted compiler..." 
+./jou${dotexe} -O1 -o tmp/tokenizers/self_hosted${dotexe} self_hosted/tokenizer.jou
+
+for file in $(find examples tests -name '*.jou'); do
+    echo $file
+    (./jou${dotexe} --tokenize-only $file || true) &> tmp/tokenizers/compiler_written_in_c.txt
+    (tmp/tokenizers/self_hosted${dotexe} $file || true) &> tmp/tokenizers/self_hosted.txt
+
+    if grep -qxF $file self_hosted/tokenizes_wrong.txt; then
+        # The file is skipped, so the two compilers should behave differently
+        if diff tmp/tokenizers/compiler_written_in_c.txt tmp/tokenizers/self_hosted.txt >/dev/null; then
+            echo "  Error: Tokenizers behave the same even though the file is listed in self_hosted/tokenizes_wrong.txt."
+            echo "  To fix this error, delete the \"$file\" line from self_hosted/tokenizes_wrong.txt."
+            exit 1
+        else
+            echo "  Tokenizers behave differently as expected (listed in self_hosted/tokenizes_wrong.txt)"
+        fi
+    else
+        if diff -u --color=always tmp/tokenizers/compiler_written_in_c.txt tmp/tokenizers/self_hosted.txt; then
+            echo "  Tokenizers behave the same as expected"
+        else
+            echo "  Error: Tokenizers behave differently when given \"$file\"."
+            echo "  You can silence this error by adding \"$file\" to self_hosted/tokenizes_wrong.txt."
+            echo "  Ideally the tokenizers would behave in the same way for all files, but we aren't there yet."
+            exit 1
+        fi
+    fi
+done
+
+echo ""
+echo ""
+echo "success :)"