Skip to content

Commit

Permalink
#9 Support astral characters in UTF-8
Browse files: browse the repository at this point in the history
  • Loading branch information
KarolS committed Oct 18, 2019
1 parent 3a6790e commit f5b6d99
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 26 deletions.
2 changes: 1 addition & 1 deletion docs/lang/text.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@

* `vectrex` – built-in Vectrex font

* `utf8` – UTF-8 (BMP only)
* `utf8` – UTF-8

* `utf16be`, `utf16le` – UTF-16BE and UTF-16LE

Expand Down
8 changes: 4 additions & 4 deletions src/main/scala/millfork/parser/MfParser.scala
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ abstract class MfParser[T](fileId: String, input: String, currentDirectory: Stri
if (zt) {
log.error("Zero-terminated encoding is not a valid encoding for a character literal", Some(p))
}
co.encode(options.log, Some(p), c.toList, options, lenient = lenient) match {
co.encode(options.log, Some(p), c.codePoints().toArray.toList, options, lenient = lenient) match {
case List(value) =>
LiteralExpression(value, 1)
case _ =>
Expand All @@ -87,7 +87,7 @@ abstract class MfParser[T](fileId: String, input: String, currentDirectory: Stri

val textLiteral: P[List[Expression]] = P(position() ~ doubleQuotedString ~/ HWS ~ codec).map {
case (p, s, ((co, zt), lenient)) =>
val characters = co.encode(options.log, None, s, options, lenient = lenient).map(c => LiteralExpression(c, 1).pos(p))
val characters = co.encode(options.log, None, s.codePoints().toArray.toList, options, lenient = lenient).map(c => LiteralExpression(c, 1).pos(p))
if (zt) characters ++ co.stringTerminator.map(nul => LiteralExpression(nul, 1))
else characters
}
Expand Down Expand Up @@ -184,7 +184,7 @@ abstract class MfParser[T](fileId: String, input: String, currentDirectory: Stri
optSlice <- ("," ~/ HWS ~/ literalAtom ~/ HWS ~/ "," ~/ HWS ~/ literalAtom ~/ HWS ~/ Pass).?
_ <- ")" ~/ Pass
} yield {
val data = Files.readAllBytes(Paths.get(currentDirectory, filePath.mkString))
val data = Files.readAllBytes(Paths.get(currentDirectory, filePath))
val slice = optSlice.fold(data) {
case (start, length) => data.slice(start.value.toInt, start.value.toInt + length.value.toInt)
}
Expand Down Expand Up @@ -613,7 +613,7 @@ object MfParser {

val identifier: P[String] = P((letter ~ lettersOrDigits).map { case (a, b) => a + b }).opaque("<identifier>")

val doubleQuotedString: P[List[Char]] = P("\"" ~/ CharsWhile(c => c != '\"' && c != '\n' && c != '\r').?.! ~ "\"").map(_.toList)
val doubleQuotedString: P[String] = P("\"" ~/ CharsWhile(c => c != '\"' && c != '\n' && c != '\r').?.! ~ "\"")

def size(value: Long, wordLiteral: Boolean, int24Literal: Boolean, int32Literal: Boolean): Int = {
val w = value > 255 || value < -0x80 || wordLiteral
Expand Down
48 changes: 27 additions & 21 deletions src/main/scala/millfork/parser/TextCodec.scala
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ sealed trait TextCodec {

def stringTerminator: List[Int]

def encode(log: Logger, position: Option[Position], s: List[Char], options: CompilationOptions, lenient: Boolean): List[Int]
def encode(log: Logger, position: Option[Position], s: List[Int], options: CompilationOptions, lenient: Boolean): List[Int]

def decode(by: Int): Char

Expand Down Expand Up @@ -55,19 +55,19 @@ class UnicodeTextCodec(override val name: String, val charset: Charset, override
}
if (escSeq.length > 1 && (escSeq(0) == 'U' || escSeq(0) == 'u')) {
try {
return encode(log, position, Character.toChars(Integer.parseInt(escSeq.tail, 16)).toList, options, lenient)
return encode(log, position, List(Integer.parseInt(escSeq.tail, 16)), options, lenient)
} catch {
case _: NumberFormatException =>
}
}
if (escSeq == "program_name_upper") {
return encode(log, position, options.outputFileName.getOrElse("MILLFORK").toUpperCase(Locale.ROOT).toList, options, lenient)
return encode(log, position, options.outputFileName.getOrElse("MILLFORK").toUpperCase(Locale.ROOT).codePoints().toArray.toList, options, lenient)
}
if (escSeq == "program_name") {
return encode(log, position, options.outputFileName.getOrElse("MILLFORK").toList, options, lenient)
return encode(log, position, options.outputFileName.getOrElse("MILLFORK").codePoints().toArray.toList, options, lenient)
}
if (escSeq == "copyright_year") {
return encode(log, position, LocalDate.now.getYear.toString.toList, options, lenient)
return encode(log, position, LocalDate.now.getYear.toString.map(_.toInt).toList, options, lenient)
}
if (escSeq == "null" || escSeq == "nullchar") {
return stringTerminator
Expand All @@ -85,9 +85,10 @@ class UnicodeTextCodec(override val name: String, val charset: Charset, override
}
}

override def encode(log: Logger, position: Option[Position], s: List[Char], options: CompilationOptions, lenient: Boolean): List[Int] = {
override def encode(log: Logger, position: Option[Position], s: List[Int], options: CompilationOptions, lenient: Boolean): List[Int] = {
val LBRACE = '{'.toInt
s match {
case '{' :: tail =>
case LBRACE :: tail =>
val (escSeq, closingBrace) = tail.span(_ != '}')
closingBrace match {
case '}' :: xs =>
Expand All @@ -97,7 +98,7 @@ class UnicodeTextCodec(override val name: String, val charset: Charset, override
Nil
}
case head :: tail =>
head.toString.getBytes(charset).map(_.&(0xff)).toList ++ encode(log, position, tail, options, lenient)
Character.toChars(head).mkString("").getBytes(charset).map(_.&(0xff)).toList ++ encode(log, position, tail, options, lenient)
case Nil => Nil
}
}
Expand All @@ -118,8 +119,8 @@ class TableTextCodec(override val name: String,

override val stringTerminator: List[Int] = List(stringTerminatorChar)

private def isPrintable(c: Char) = {
c.getType match {
private def isPrintable(c: Int) = {
Character.getType(c) match {
case Character.LOWERCASE_LETTER => true
case Character.UPPERCASE_LETTER => true
case Character.TITLECASE_LETTER => true
Expand Down Expand Up @@ -148,15 +149,16 @@ class TableTextCodec(override val name: String,
}
}

private def format(c:Char):String = {
private def format(c:Int):String = {
val u = f"U+${c.toInt}%04X"
if (isPrintable(c)) f"`$c%c` ($u%s)"
if (isPrintable(c)) f"`${Character.toChars(c).mkString}%s` ($u%s)"
else u
}

private def format(s:String) = {
val u = s.map(c => f"U+${c.toInt}%04X").mkString(",")
if (s.forall(isPrintable)) f"`$s%s` ($u%s)"
val codePoints = s.codePoints().toArray
val u = codePoints.map(c => f"U+${c}%04X").mkString(",")
if (codePoints.forall(isPrintable)) f"`$s%s` ($u%s)"
else u
}
private def encodeChar(log: Logger, position: Option[Position], c: Char, options: CompilationOptions, lenient: Boolean): Option[List[Int]] = {
Expand All @@ -177,10 +179,11 @@ class TableTextCodec(override val name: String,
}


def encode(log: Logger, position: Option[Position], s: List[Char], options: CompilationOptions, lenient: Boolean): List[Int] = {
def encode(log: Logger, position: Option[Position], s: List[Int], options: CompilationOptions, lenient: Boolean): List[Int] = {
val LBRACE = '{'.toInt
val lenient = options.flag(CompilationFlag.LenientTextEncoding)
s match {
case '{' :: tail =>
case LBRACE :: tail =>
val (escSeq, closingBrace) = tail.span(_ != '}')
closingBrace match {
case '}' :: xs =>
Expand All @@ -189,13 +192,16 @@ class TableTextCodec(override val name: String,
log.error(f"Unclosed escape sequence", position)
Nil
}
case head :: tail =>
(encodeChar(log, position, head, options, lenient) match {
case head :: tail if head >= Char.MinValue && head <= Char.MaxValue =>
(encodeChar(log, position, head.toChar, options, lenient) match {
case Some(x) => x
case None =>
log.error(f"Invalid character ${format(head)} in string", position)
Nil
}) ++ encode(log, position, tail, options, lenient)
case head :: tail =>
log.error(f"Invalid character ${format(head)} in string", position)
encode(log, position, tail, options, lenient)
case Nil => Nil
}
}
Expand All @@ -209,13 +215,13 @@ class TableTextCodec(override val name: String,
}
}
if (escSeq == "program_name_upper") {
return encode(log, position, options.outputFileName.getOrElse("MILLFORK").toUpperCase(Locale.ROOT).toList, options, lenient)
return encode(log, position, options.outputFileName.getOrElse("MILLFORK").toUpperCase(Locale.ROOT).codePoints().toArray.toList, options, lenient)
}
if (escSeq == "program_name") {
return encode(log, position, options.outputFileName.getOrElse("MILLFORK").toList, options, lenient)
return encode(log, position, options.outputFileName.getOrElse("MILLFORK").codePoints().toArray.toList, options, lenient)
}
if (escSeq == "copyright_year") {
return encode(log, position, LocalDate.now.getYear.toString.toList, options, lenient)
return encode(log, position, LocalDate.now.getYear.toString.map(_.toInt).toList, options, lenient)
}
if (escSeq == "null" || escSeq == "nullchar") {
return stringTerminator
Expand Down
6 changes: 6 additions & 0 deletions src/test/scala/millfork/test/TextCodecSuite.scala
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,12 @@ class TextCodecSuite extends FunSuite with Matchers {
| if p[1] != 0 { poke($bff8, 0) }
| if p[2] != 0 { poke($bff7, 0) }
| if p[3] != 0 { poke($bff6, 0) }
| p = "𓀀"utf8z
| if p[0] == 0 { poke($bff3, p[0]) }
| if p[1] == 0 { poke($bff2, p[1]) }
| if p[2] == 0 { poke($bff1, p[2]) }
| if p[3] == 0 { poke($bff0, p[3]) }
| if p[4] != 0 { poke($bfef, p[4]) }
| }
| macro asm void poke(word const addr, byte a) {
| STA addr
Expand Down

0 comments on commit f5b6d99

Please sign in to comment.