Skip to content

Commit

Permalink
Refactor parsing from compiler to parser
Browse files Browse the repository at this point in the history
  • Loading branch information
john-z-yang committed Dec 16, 2023
1 parent eac3de0 commit c64a842
Show file tree
Hide file tree
Showing 8 changed files with 298 additions and 240 deletions.
2 changes: 1 addition & 1 deletion makefile
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ To set it to the /lib folder of this project.
endef

_DEPS = code/Code.hpp \
compile/Compiler.hpp \
compile/Compiler.hpp compile/Parser.hpp \
error/RuntimeError.hpp error/SyntaxError.hpp error/TypeError.hpp \
fn/CPPFnImpls.hpp \
repl/repl.hpp \
Expand Down
222 changes: 18 additions & 204 deletions src/compile/Compiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
#include "../error/SyntaxError.hpp"
#include "../runtime/VM.hpp"
#include "../sexpr/Casting.hpp"
#include "../sexpr/String.hpp"
#include "Grammar.hpp"
#include "SrcLoc.hpp"
#include <optional>
Expand All @@ -17,178 +16,22 @@ using namespace compile;
using namespace runtime;
using namespace error;

std::vector<Token> Compiler::tokenize(std::vector<std::string> lines) {
std::vector<Token> tokens;
for (unsigned int row{1}; const auto &line : lines) {
auto newTokens = tokenize(line, row);
tokens.insert(tokens.cend(), newTokens.cbegin(), newTokens.cend());
++row;
}
return tokens;
}

/// Splits one source line into tokens.
///
/// A token is the first alternative of the pattern that matches at the
/// current position: a double-quoted string literal (backslash escapes
/// allowed), `;`, `(`, `)`, `,@`, `,`, a backtick, `'`, or a run of
/// characters that are not whitespace, parentheses, `,`, `@`, backtick or `'`.
///
/// @param line The text of the line to tokenize.
/// @param row  The row number stored in every produced token's source
///             location.
/// @return The tokens of `line`, in order; each token's column is the offset
///         of its match within `line`.
std::vector<Token>
Compiler::tokenize(std::string line, const unsigned int row) {
  // Compiling a std::regex is expensive and the pattern never changes, so
  // build it once instead of once per source line.
  static const std::regex tokenPattern(
      "\\\"(?:[^\"\\\\]*(?:\\\\.)?)*\\\"|;|\\(|\\)|,@|,|`|'|[^\\s(),@,`']+"
  );

  std::vector<Token> tokens;
  const auto matchesEnd = std::sregex_iterator();
  for (auto match =
           std::sregex_iterator(line.cbegin(), line.cend(), tokenPattern);
       match != matchesEnd;
       ++match) {
    tokens.push_back(Token{
        match->str(),
        {
            row,
            static_cast<unsigned int>(match->position()),
        }
    });
  }
  return tokens;
}

/// Reports whether `s` is, in its entirety, a number `std::stod` can parse.
///
/// `std::stod` on its own succeeds as soon as a *prefix* of the string is
/// numeric, which would classify symbols such as `1+` or `2abc` as numbers
/// (and later truncate them to `1` and `2`). Requiring that every character
/// was consumed rejects those tokens.
///
/// @param s The token text to test.
/// @return true if the whole of `s` converts to a double, false otherwise
///         (including when the value is out of range for a double).
bool Compiler::isNum(const std::string s) {
  try {
    std::size_t consumed = 0;
    std::stod(s, &consumed);
    return consumed == s.size();
  } catch (...) {
    // std::stod signals "not a number" and "out of range" by throwing;
    // both simply mean the token is not a usable numeric literal.
    return false;
  }
}

/// Tokenizes the stored `source` lines and parses them into a list of
/// top-level forms.
///
/// @return The parsed forms, cast to `SExprs`.
const SExprs *Compiler::parse() {
  const auto tokens = tokenize(source);
  auto cursor = tokens.cbegin();
  const auto forms = parseLists(cursor, tokens.cend());
  return cast<SExprs>(forms);
}

/// Parses every remaining top-level form into a chain of `SExprs` cells.
///
/// Each cell holds one form parsed by `parseList` and the chain built from
/// the rest of the input; an exhausted input yields `Nil`. The source
/// location of each form's first token is recorded in `srcMap` for its cell.
///
/// @param it  Cursor into the token stream; advanced past everything parsed.
/// @param end One past the last token.
/// @return `Nil` when no tokens remain, otherwise the head cell of the chain.
const SExpr *Compiler::parseLists(TokenIter &it, const TokenIter &end) {
  if (it == end) {
    return vm.heap.alloc<Nil>();
  }
  // Capture the location before parsing moves the cursor.
  const auto startLoc = it->srcLoc;
  const auto head = parseList(it, end);
  const auto tail = parseLists(it, end);
  const auto cell = vm.heap.alloc<SExprs>(head, tail);
  srcMap[cell] = {startLoc.row, startLoc.col};
  return cell;
}

/// Parses a single datum starting at `it`.
///
/// - `(` starts a list, whose elements are parsed by `parseElem`.
/// - `'`, a backtick, `,` and `,@` wrap the following datum in a two-element
///   list whose first element is the symbol produced by `parseAtom` for that
///   token.
/// - Any other token is parsed as an atom.
///
/// @param it  Cursor into the token stream; advanced past the datum.
/// @param end One past the last token.
/// @return The parsed datum.
/// @throws SyntaxError if the input ends where a datum is required (for
///         example a trailing `'`), instead of reading past the last token.
const SExpr *Compiler::parseList(TokenIter &it, const TokenIter &end) {
  if (it == end) {
    // Report the end of the last source line as the error location.
    const std::string lastLine =
        source.empty() ? std::string() : source.back();
    throw SyntaxError(
        "Unexpected end of input.",
        lastLine,
        static_cast<unsigned int>(source.size()),
        static_cast<unsigned int>(lastLine.size())
    );
  }

  // The token vector is not modified while parsing, so the reference stays
  // valid after the cursor advances.
  const Token &token = *it;
  ++it;

  if (token.str == "(") {
    const auto sExprs = parseElem(it, end);
    srcMap.insert({sExprs, {token.srcLoc.row, token.srcLoc.col}});
    return sExprs;
  }

  if (token.str == "'" || token.str == "`" || token.str == "," ||
      token.str == ",@") {
    const auto quoted = parseList(it, end);
    const auto rest = vm.heap.alloc<SExprs>(quoted, vm.heap.alloc<Nil>());
    srcMap.insert({rest, {token.srcLoc.row, token.srcLoc.col}});
    const auto sExprs = vm.heap.alloc<SExprs>(parseAtom(token), rest);
    srcMap.insert({sExprs, {token.srcLoc.row, token.srcLoc.col}});
    return sExprs;
  }

  return parseAtom(token);
}

/// Parses the remainder of a list whose opening `(` has been consumed.
///
/// - `)` closes the list and yields `Nil`.
/// - `(` starts a nested list, which becomes the first element; the rest of
///   the enclosing list follows it.
/// - Anything else is handed to `parseSexprs`.
///
/// @param it  Cursor into the token stream; advanced past the list's `)`.
/// @param end One past the last token.
/// @return The list contents as `SExprs` cells, or `Nil` for an empty rest.
/// @throws SyntaxError if the input ends before the list is closed, instead
///         of reading past the last token.
const SExpr *Compiler::parseElem(TokenIter &it, const TokenIter &end) {
  if (it == end) {
    // Report the end of the last source line as the error location.
    const std::string lastLine =
        source.empty() ? std::string() : source.back();
    throw SyntaxError(
        "Unexpected end of input: missing \")\".",
        lastLine,
        static_cast<unsigned int>(source.size()),
        static_cast<unsigned int>(lastLine.size())
    );
  }

  // The token vector is not modified while parsing, so the reference stays
  // valid after the cursor advances.
  const Token &token = *it;

  if (token.str == ")") {
    ++it;
    return vm.heap.alloc<Nil>();
  }

  if (token.str == "(") {
    ++it;
    const auto first = parseElem(it, end);
    const auto rest = parseElem(it, end);
    const auto sExprs = vm.heap.alloc<SExprs>(first, rest);
    srcMap.insert({sExprs, {token.srcLoc.row, token.srcLoc.col}});
    return sExprs;
  }

  return parseSexprs(it, end);
}

/// Parses one list element followed by the rest of the list, handling the
/// dotted-pair form `first . rest)`.
///
/// Precondition: `it != end` (the visible caller, `parseElem`, has already
/// inspected the current token).
///
/// @param it  Cursor into the token stream; advanced past the list's `)`.
/// @param end One past the last token.
/// @return An `SExprs` cell holding the element and the rest of the list;
///         its source location is that of the element's first token.
/// @throws SyntaxError if a `.` is the last token, if the input ends right
///         after the dotted tail, or if the dotted tail is followed by
///         anything other than `)`.
const SExpr *Compiler::parseSexprs(TokenIter &it, const TokenIter &end) {
  const auto token = *it;
  const auto first = parseList(it, end);

  // Check for the end of input before looking at the next token.
  if (it != end && it->str == ".") {
    const Token &dot = *it;
    ++it;
    if (it == end) {
      // Nothing follows the dot, so there is no tail to parse.
      handleUnexpectedToken(dot, source[dot.srcLoc.row - 1]);
    }
    const auto rest = parseList(it, end);
    if (it == end) {
      handleTypeError(dotGrammer, "datum", rest);
    } else if (it->str != ")") {
      // Exactly one datum may follow the dot; do not silently discard
      // whatever token comes next.
      handleUnexpectedToken(*it, source[it->srcLoc.row - 1]);
    }
    ++it;
    const auto sExprs = vm.heap.alloc<SExprs>(first, rest);
    srcMap.insert({sExprs, {token.srcLoc.row, token.srcLoc.col}});
    return sExprs;
  }

  const auto rest = parseElem(it, end);
  const auto sExprs = vm.heap.alloc<SExprs>(first, rest);
  srcMap.insert({sExprs, {token.srcLoc.row, token.srcLoc.col}});
  return sExprs;
}

/// Converts a single token into the atom it denotes.
///
/// Checked in this order: number, double-quoted string, `#<undefined>`,
/// `#t` / `#f`, then symbols. The reader-macro tokens `'`, backtick, `,`
/// and `,@` become the symbols `quote`, `quasiquote`, `unquote` and
/// `unquote-splicing`; every other token becomes a symbol with its own text.
///
/// @param token The token to convert.
/// @return The newly allocated atom.
const SExpr *Compiler::parseAtom(Token token) {
  if (isNum(token.str)) {
    return vm.heap.alloc<Num>(std::stod(token.str));
  }

  const bool isStringLiteral =
      token.str.front() == '\"' && token.str.back() == '\"';
  if (isStringLiteral) {
    return vm.heap.alloc<String>(token.str);
  }

  if (token.str == "#<undefined>") {
    return vm.heap.alloc<Undefined>();
  }

  if (token.str == "#t" || token.str == "#f") {
    return vm.heap.alloc<Bool>(token.str == "#t");
  }

  // Everything left is a symbol; reader macros get their long-form name.
  std::string symbolName = token.str;
  if (symbolName == "'") {
    symbolName = "quote";
  } else if (symbolName == "`") {
    symbolName = "quasiquote";
  } else if (symbolName == ",") {
    symbolName = "unquote";
  } else if (symbolName == ",@") {
    symbolName = "unquote-splicing";
  }
  return vm.heap.alloc<Sym>(symbolName);
}

/// Raises a syntax error for a token that may not appear where it was found.
///
/// @param token The offending token; its text and source location are
///              reported.
/// @param line  The source line containing the token.
/// @throws SyntaxError always, with the message `Unexpected "<token>".`.
void Compiler::handleUnexpectedToken(
    const Token &token, const std::string &line
) {
  throw SyntaxError(
      "Unexpected \"" + token.str + "\".",
      line,
      token.srcLoc.row,
      token.srcLoc.col
  );
}

Compiler::Compiler(
const std::vector<std::string> source,
SrcMap sourceLoc,
const SExpr *param,
const SExprs *body,
ParsedSrc &parsedSrc,
const sexpr::SExpr *param,
const sexpr::SExprs *body,
Compiler &enclosing,
VM &vm
runtime::VM &vm
)
: vm(vm),
enclosing(enclosing),
source(source),
srcMap(sourceLoc),
curSrcLoc({srcMap[param].row, srcMap[param].col}),
parsedSrc(parsedSrc),
curSrcLoc({parsedSrc.srcMap[param].row, parsedSrc.srcMap[param].col}),
param(param),
body(body),
arity(countArity()),
variadic(isVariadic()),
body(body),
stackOffset(1) {

if (const auto sExprs = dynCast<SExprs>(param)) {
visitEach(sExprs.value(), [this](const auto sExpr) {
const auto sym = cast<Sym>(sExpr);
Expand All @@ -206,7 +49,7 @@ Compiler::Compiler(
}

// Sets curSrcLoc to the source location recorded for sExpr.
//
// NOTE(review): this text comes from a diff view with the +/- markers
// stripped, so the two assignments below look like the removed line
// (srcMap) followed by the added line (parsedSrc.srcMap). As written, the
// second assignment overwrites the first. Confirm which one should remain.
void Compiler::updateCurSrcLoc(const sexpr::SExprs *sExpr) {
curSrcLoc = srcMap[sExpr];
curSrcLoc = parsedSrc.srcMap[sExpr];
}

std::optional<const std::size_t> Compiler::resolveLocal(const Sym *sym) {
Expand Down Expand Up @@ -365,7 +208,7 @@ void Compiler::compileAtom(const Atom *atom) {
if (isa<Nil>(atom)) {
throw error::SyntaxError(
"Expected a non-empty list.",
source[curSrcLoc.row - 1],
parsedSrc.source[curSrcLoc.row - 1],
curSrcLoc.row,
curSrcLoc.col
);
Expand Down Expand Up @@ -403,12 +246,7 @@ void Compiler::emitLambda(const MatchedSExpr<sexpr::SExpr> matched) {
}

Compiler compiler(
source,
srcMap,
lambdaParam.get(),
cast<SExprs>(lambdaBody.get()),
*this,
vm
parsedSrc, lambdaParam.get(), cast<SExprs>(lambdaBody.get()), *this, vm
);
const auto function = compiler.compile();

Expand Down Expand Up @@ -467,7 +305,7 @@ void Compiler::execDefMacro(const MatchedSExpr<sexpr::SExpr> matched) {
const auto [row, col] = curSrcLoc;
throw error::SyntaxError(
"Invalid syntax for define-macro: must define macros in top level",
source[row - 1],
parsedSrc.source[row - 1],
row,
col
);
Expand All @@ -477,12 +315,7 @@ void Compiler::execDefMacro(const MatchedSExpr<sexpr::SExpr> matched) {
unpackPartial<Sym, SExpr>(matched.get());

Compiler compiler(
source,
srcMap,
macroArgNames.get(),
cast<SExprs>(macroBody.get()),
*this,
vm
parsedSrc, macroArgNames.get(), cast<SExprs>(macroBody.get()), *this, vm
);
const auto function = compiler.compile();

Expand Down Expand Up @@ -585,7 +418,7 @@ const SExpr *Compiler::execMacro(const SExpr *sExpr) {
const auto res = vm.eval();

traverse(res, [this](const auto &sExpr) {
srcMap.insert({sExpr, curSrcLoc});
parsedSrc.srcMap.insert({sExpr, curSrcLoc});
});

return res;
Expand All @@ -596,7 +429,7 @@ void Compiler::handleInvalidDef() {
throw error::SyntaxError(
"Invalid syntax for define: cannot use define as an "
"expression",
source[row - 1],
parsedSrc.source[row - 1],
row,
col
);
Expand All @@ -609,18 +442,18 @@ void Compiler::handleTypeError(
ss << "Invalid syntax for " << grammar << "." << std::endl
<< "Expected " << expected << ", but got " << actual << ".";
const auto [row, col] = curSrcLoc;
throw SyntaxError(ss.str(), source[row - 1], row, col);
throw SyntaxError(ss.str(), parsedSrc.source[row - 1], row, col);
}

Compiler::Compiler(std::vector<std::string> source, VM &vm)
Compiler::Compiler(runtime::VM &vm, ParsedSrc &parsedSrc)
: vm(vm),
gcGuard(vm.heap.pauseGC()),
source(source),
parsedSrc(parsedSrc),
curSrcLoc({1, 0}),
param(vm.heap.alloc<Nil>()),
body(parsedSrc.root),
arity(0),
variadic(false),
body(parse()),
stackOffset(1) {}

const Prototype *Compiler::compile() {
Expand All @@ -633,22 +466,3 @@ const Prototype *Compiler::compile() {

return vm.heap.alloc<Prototype>(upValues.size(), arity, variadic, code);
}

void Compiler::verifyLex(
const std::string &line,
const unsigned int curSrcLoc,
unsigned int &openParen,
unsigned int &closedParen
) {
auto tokens = tokenize(line, curSrcLoc);
for (auto it = tokens.cbegin(); it != tokens.cend(); ++it) {
if (openParen == closedParen && it->str == ")") {
handleUnexpectedToken(*it, line);
}
if (it->str == "(") {
openParen += 1;
} else if (it->str == ")") {
closedParen += 1;
}
}
}
Loading

0 comments on commit c64a842

Please sign in to comment.