diff --git a/modules/parser/src/main/scala/playground/smithyql/parser/v2/scanner.scala b/modules/parser/src/main/scala/playground/smithyql/parser/v2/scanner.scala
new file mode 100644
index 00000000..1cf4799d
--- /dev/null
+++ b/modules/parser/src/main/scala/playground/smithyql/parser/v2/scanner.scala
@@ -0,0 +1,185 @@
+package playground.smithyql.parser.v2.scanner
+
+import cats.kernel.Eq
+import cats.syntax.all.*
+
+/** A lexical token: its kind plus the exact slice of input text it covers. */
+case class Token(
+  kind: TokenKind,
+  text: String,
+) {
+  /** Number of `Char`s in the token's text. */
+  def width: Int = text.length
+}
+
+object Token {
+  implicit val eq: Eq[Token] = Eq.fromUniversalEquals
+}
+
+/** The category of a [[Token]]. Applying a kind to a piece of text builds a token of that kind. */
+sealed trait TokenKind extends Product with Serializable {
+
+  def apply(
+    text: String
+  ): Token = Token(this, text)
+
+}
+
+object TokenKind {
+  case object KW_IMPORT extends TokenKind
+  case object DOT extends TokenKind
+  case object COMMA extends TokenKind
+  case object HASH extends TokenKind
+  case object LB extends TokenKind
+  case object RB extends TokenKind
+  case object LBR extends TokenKind
+  case object RBR extends TokenKind
+  case object EQ extends TokenKind
+  case object SPACE extends TokenKind
+  case object NEWLINE extends TokenKind
+  case object IDENT extends TokenKind
+  case object COMMENT extends TokenKind
+  case object Error extends TokenKind
+
+  implicit val eq: Eq[TokenKind] = Eq.fromUniversalEquals
+}
+
+object Scanner {
+
+  /** Entrypoint to scanning text into tokens.
+    *
+    * Always produces an output that can be rendered back to the original text: concatenating the
+    * `text` of all returned tokens yields `s`. Input that matches no other token kind is emitted
+    * as [[TokenKind.Error]] tokens.
+    */
+  def scan(
+    s: String
+  ): List[Token] = {
+    var remaining = s
+    // Built in reverse (prepending is cheap); reversed once at the end.
+    var tokens = List.empty[Token]
+    def add(
+      tok: Token
+    ) = tokens ::= tok
+
+    // Matches exactly one character and emits it as a token of the given kind.
+    def readSimple(
+      token: Char,
+      tok: TokenKind,
+    ): PartialFunction[Char, Unit] = { case `token` =>
+      add(tok(token.toString))
+      remaining = remaining.tail
+    }
+
+    def simpleTokens(
+      pairings: (
+        Char,
+        TokenKind,
+      )*
+    ): PartialFunction[Char, Unit] = pairings
+      .map(readSimple.tupled)
+      .reduce(_ orElse _)
+
+    // Tokens that can be recognized from their first character: punctuation and identifiers.
+    val readOne: PartialFunction[Char, Unit] = simpleTokens(
+      '.' -> TokenKind.DOT,
+      ',' -> TokenKind.COMMA,
+      '#' -> TokenKind.HASH,
+      '[' -> TokenKind.LB,
+      ']' -> TokenKind.RB,
+      '{' -> TokenKind.LBR,
+      '}' -> TokenKind.RBR,
+      '=' -> TokenKind.EQ,
+    ).orElse {
+      case letter if letter.isLetter =>
+        val (letters, rest) = remaining.span(ch => ch.isLetterOrDigit || ch == '_')
+        add(TokenKind.IDENT(letters))
+        remaining = rest
+    }
+
+    // split "whitespace" string into chains of contiguous newlines OR whitespace characters.
+    def whitespaceChains(
+      whitespace: String
+    ): List[Token] = {
+      val isNewline = (ch: Char) => ch == '\n'
+
+      if (whitespace.isEmpty)
+        Nil
+      else if (isNewline(whitespace.head)) {
+        val (nl, rest) = whitespace.span(isNewline)
+        TokenKind.NEWLINE(nl) :: whitespaceChains(rest)
+      } else {
+        val (wsp, rest) = whitespace.span(!isNewline(_))
+        TokenKind.SPACE(wsp) :: whitespaceChains(rest)
+      }
+    }
+
+    def eatWhitespace(
+    ) = {
+      val (wsp, rest) = remaining.span(ch => ch.isWhitespace)
+      if (wsp.isEmpty())
+        false
+      else {
+        whitespaceChains(wsp).foreach(add)
+        remaining = rest
+
+        true
+      }
+    }
+
+    // A comment runs from "//" up to, but not including, the next newline.
+    def eatComments(
+    ) =
+      if (!remaining.startsWith("//"))
+        false
+      else {
+        while (remaining.startsWith("//")) {
+          val (comment, rest) = remaining.span(_ != '\n')
+          add(TokenKind.COMMENT(comment))
+          remaining = rest
+        }
+
+        true
+      }
+
+    def eatErrors(
+    ) = {
+      // An error run ends at the first position where anything else can be scanned: a character
+      // accepted by readOne, whitespace, or the start of a "//" comment. Without the last two
+      // checks, whitespace and comments following an unknown character were swallowed into the
+      // Error token.
+      // todo: readOne only looks at a single character, so tokens that need more lookahead
+      // would require reworking it to consume arbitrary constant-length tokens.
+      val errorEnd = remaining.indices
+        .find { i =>
+          val ch = remaining.charAt(i)
+          readOne.isDefinedAt(ch) || ch.isWhitespace || remaining.startsWith("//", i)
+        }
+        .getOrElse(remaining.length)
+      val (failures, rest) = remaining.splitAt(errorEnd)
+      remaining = rest
+      if (failures.nonEmpty) {
+        add(TokenKind.Error(failures))
+        true
+      } else
+        false
+    }
+
+    while (remaining.nonEmpty) {
+      val last = remaining
+
+      readOne.applyOrElse[Char, Any](
+        remaining.head,
+        (_: Char) =>
+          // nothing matched. Eat whitespace and see if the rest is an error
+          eatWhitespace() || eatComments() || eatErrors(),
+      )
+
+      if (remaining == last)
+        sys.error(s"no progress in the last run! remaining string: $remaining")
+    }
+
+    tokens.reverse
+  }
+
+}
diff --git a/modules/parser/src/test/scala/playground/smithyql/parser/v2/ScannerTests.scala b/modules/parser/src/test/scala/playground/smithyql/parser/v2/ScannerTests.scala new file mode 100644 index 00000000..a2cb582f --- /dev/null +++ b/modules/parser/src/test/scala/playground/smithyql/parser/v2/ScannerTests.scala @@ -0,0 +1,171 @@ +package playground.smithyql.parser.v2 + +import cats.effect.IO +import cats.implicits._ +import org.scalacheck.Arbitrary +import org.scalacheck.Gen +import playground.smithyql.parser.v2.scanner.Scanner +import playground.smithyql.parser.v2.scanner.Token +import playground.smithyql.parser.v2.scanner.TokenKind +import weaver._ +import weaver.scalacheck.Checkers + +import Scanner.scan + +object ScannerTests extends SimpleIOSuite with Checkers { + + def arbTests( + name: TestName + )( + withArb: Arbitrary[String] => IO[Expectations] + ): Unit = { + + val sampleStringGen = Gen.oneOf( + Gen.alphaStr, + Gen.alphaNumStr, + Gen.asciiPrintableStr, + Gen.identifier, + Gen.oneOf(List(' ', '\n', '\t', '\r', '\f', '\b')).map(_.toString), + ) + + val arbString: Arbitrary[String] = Arbitrary { + Gen.listOf(sampleStringGen).map(_.mkString) + } + + test(name)(withArb(Arbitrary.arbString)) + test(name.copy(name = name.name + " (prepared 
input)"))(withArb(arbString)) + } + + arbTests("Any string input scans successfully") { implicit arbString => + forall { (s: String) => + scan(s): Unit + success + } + } + + arbTests("Scanning is lossless") { implicit arbString => + forall { (s: String) => + assert.eql(scan(s).foldMap(_.text), s) + } + } + + private def scanTest( + input: String, + explicitName: String = "", + )( + expected: List[Token] + ): Unit = + pureTest( + if (explicitName.nonEmpty) + explicitName + else + "Scan string: " + sanitize(input) + ) { + assert.eql(expected, scan(input)) + } + + private def sanitize( + text: String + ) = text.replace(" ", "·").replace("\n", "↵") + + scanTest("{")(List(TokenKind.LBR("{"))) + scanTest("}")(List(TokenKind.RBR("}"))) + scanTest("[")(List(TokenKind.LB("["))) + scanTest("]")(List(TokenKind.RB("]"))) + scanTest(".")(List(TokenKind.DOT("."))) + scanTest(",")(List(TokenKind.COMMA(","))) + scanTest("#")(List(TokenKind.HASH("#"))) + scanTest("=")(List(TokenKind.EQ("="))) + scanTest("a")(List(TokenKind.IDENT("a"))) + + // idents + scanTest("abcdef")(List(TokenKind.IDENT("abcdef"))) + + scanTest( + "hello_world" + )( + List( + TokenKind.IDENT("hello_world") + ) + ) + + scanTest( + "helloworld123" + )( + List( + TokenKind.IDENT("helloworld123") + ) + ) + + // whitespace + scanTest(" ")(List(TokenKind.SPACE(" "))) + scanTest("\n")(List(TokenKind.NEWLINE("\n"))) + + // contiguous whitespace of all kinds + // notably newlines are grouped together separately from other whitespace + scanTest(" \r \r \n\n")(List(TokenKind.SPACE(" \r \r "), TokenKind.NEWLINE("\n\n"))) + scanTest(" \n\n \n ")( + List( + TokenKind.SPACE(" "), + TokenKind.NEWLINE("\n\n"), + TokenKind.SPACE(" "), + TokenKind.NEWLINE("\n"), + TokenKind.SPACE(" "), + ) + ) + + // comments + scanTest("// hello 123 foo bar --")(List(TokenKind.COMMENT("// hello 123 foo bar --"))) + + scanTest( + explicitName = "Scan multiple line-comments", + input = + """//hello + |//world""".stripMargin, + )( + List( + 
TokenKind.COMMENT("//hello"), + TokenKind.NEWLINE("\n"), + TokenKind.COMMENT("//world"), + ) + ) + + scanTest( + "hello world //this is a comment" + )( + List( + TokenKind.IDENT("hello"), + TokenKind.SPACE(" "), + TokenKind.IDENT("world"), + TokenKind.SPACE(" "), + TokenKind.COMMENT("//this is a comment"), + ) + ) + + // errors + + scanTest( + explicitName = "Error tokens for input that doesn't match any other token", + input = "🤷*%$^@-+?", + )(List(TokenKind.Error("🤷*%$^@-+?"))) + + scanTest( + explicitName = "Error tokens mixed between other tokens", + input = "hello@world-this?is=an