From f5b6d9999c886875352bf750e77d595019284495 Mon Sep 17 00:00:00 2001 From: Karol Stasiak Date: Fri, 18 Oct 2019 11:01:31 +0200 Subject: [PATCH] #9 Support astral characters in UTF-8 --- docs/lang/text.md | 2 +- src/main/scala/millfork/parser/MfParser.scala | 8 ++-- .../scala/millfork/parser/TextCodec.scala | 48 +++++++++++-------- .../scala/millfork/test/TextCodecSuite.scala | 6 +++ 4 files changed, 38 insertions(+), 26 deletions(-) diff --git a/docs/lang/text.md b/docs/lang/text.md index 04bcbcbc..371ed714 100644 --- a/docs/lang/text.md +++ b/docs/lang/text.md @@ -55,7 +55,7 @@ * `vectrex` – built-in Vectrex font -* `utf8` – UTF-8 (BMP only) +* `utf8` – UTF-8 * `utf16be`, `utf16le` – UTF-16BE and UTF-16LE diff --git a/src/main/scala/millfork/parser/MfParser.scala b/src/main/scala/millfork/parser/MfParser.scala index f76c13d1..8becf726 100644 --- a/src/main/scala/millfork/parser/MfParser.scala +++ b/src/main/scala/millfork/parser/MfParser.scala @@ -68,7 +68,7 @@ abstract class MfParser[T](fileId: String, input: String, currentDirectory: Stri if (zt) { log.error("Zero-terminated encoding is not a valid encoding for a character literal", Some(p)) } - co.encode(options.log, Some(p), c.toList, options, lenient = lenient) match { + co.encode(options.log, Some(p), c.codePoints().toArray.toList, options, lenient = lenient) match { case List(value) => LiteralExpression(value, 1) case _ => @@ -87,7 +87,7 @@ abstract class MfParser[T](fileId: String, input: String, currentDirectory: Stri val textLiteral: P[List[Expression]] = P(position() ~ doubleQuotedString ~/ HWS ~ codec).map { case (p, s, ((co, zt), lenient)) => - val characters = co.encode(options.log, None, s, options, lenient = lenient).map(c => LiteralExpression(c, 1).pos(p)) + val characters = co.encode(options.log, None, s.codePoints().toArray.toList, options, lenient = lenient).map(c => LiteralExpression(c, 1).pos(p)) if (zt) characters ++ co.stringTerminator.map(nul => LiteralExpression(nul, 1)) else characters } @@ -184,7 +184,7 @@ abstract class MfParser[T](fileId: String, input: String, currentDirectory: Stri optSlice <- ("," ~/ HWS ~/ literalAtom ~/ HWS ~/ "," ~/ HWS ~/ literalAtom ~/ HWS ~/ Pass).? _ <- ")" ~/ Pass } yield { - val data = Files.readAllBytes(Paths.get(currentDirectory, filePath.mkString)) + val data = Files.readAllBytes(Paths.get(currentDirectory, filePath)) val slice = optSlice.fold(data) { case (start, length) => data.slice(start.value.toInt, start.value.toInt + length.value.toInt) } @@ -613,7 +613,7 @@ object MfParser { val identifier: P[String] = P((letter ~ lettersOrDigits).map { case (a, b) => a + b }).opaque("") - val doubleQuotedString: P[List[Char]] = P("\"" ~/ CharsWhile(c => c != '\"' && c != '\n' && c != '\r').?.! ~ "\"").map(_.toList) + val doubleQuotedString: P[String] = P("\"" ~/ CharsWhile(c => c != '\"' && c != '\n' && c != '\r').?.! ~ "\"") def size(value: Long, wordLiteral: Boolean, int24Literal: Boolean, int32Literal: Boolean): Int = { val w = value > 255 || value < -0x80 || wordLiteral diff --git a/src/main/scala/millfork/parser/TextCodec.scala b/src/main/scala/millfork/parser/TextCodec.scala index deb9a919..f69137ce 100644 --- a/src/main/scala/millfork/parser/TextCodec.scala +++ b/src/main/scala/millfork/parser/TextCodec.scala @@ -17,7 +17,7 @@ sealed trait TextCodec { def stringTerminator: List[Int] - def encode(log: Logger, position: Option[Position], s: List[Char], options: CompilationOptions, lenient: Boolean): List[Int] + def encode(log: Logger, position: Option[Position], s: List[Int], options: CompilationOptions, lenient: Boolean): List[Int] def decode(by: Int): Char @@ -55,19 +55,19 @@ class UnicodeTextCodec(override val name: String, val charset: Charset, override } if (escSeq.length > 1 && (escSeq(0) == 'U' || escSeq(0) == 'u')) { try { - return encode(log, position, Character.toChars(Integer.parseInt(escSeq.tail, 16)).toList, options, lenient) + return encode(log, position, List(Integer.parseInt(escSeq.tail, 16)), options, lenient) } catch { case _: NumberFormatException => } } if (escSeq == "program_name_upper") { - return encode(log, position, options.outputFileName.getOrElse("MILLFORK").toUpperCase(Locale.ROOT).toList, options, lenient) + return encode(log, position, options.outputFileName.getOrElse("MILLFORK").toUpperCase(Locale.ROOT).codePoints().toArray.toList, options, lenient) } if (escSeq == "program_name") { - return encode(log, position, options.outputFileName.getOrElse("MILLFORK").toList, options, lenient) + return encode(log, position, options.outputFileName.getOrElse("MILLFORK").codePoints().toArray.toList, options, lenient) } if (escSeq == "copyright_year") { - return encode(log, position, LocalDate.now.getYear.toString.toList, options, lenient) + return encode(log, position, LocalDate.now.getYear.toString.map(_.toInt).toList, options, lenient) } if (escSeq == "null" || escSeq == "nullchar") { return stringTerminator @@ -85,9 +85,10 @@ class UnicodeTextCodec(override val name: String, val charset: Charset, override } } - override def encode(log: Logger, position: Option[Position], s: List[Char], options: CompilationOptions, lenient: Boolean): List[Int] = { + override def encode(log: Logger, position: Option[Position], s: List[Int], options: CompilationOptions, lenient: Boolean): List[Int] = { + val LBRACE = '{'.toInt s match { - case '{' :: tail => + case LBRACE :: tail => val (escSeq, closingBrace) = tail.span(_ != '}') closingBrace match { case '}' :: xs => @@ -97,7 +98,7 @@ class UnicodeTextCodec(override val name: String, val charset: Charset, override Nil } case head :: tail => - head.toString.getBytes(charset).map(_.&(0xff)).toList ++ encode(log, position, tail, options, lenient) + Character.toChars(head).mkString("").getBytes(charset).map(_.&(0xff)).toList ++ encode(log, position, tail, options, lenient) case Nil => Nil } } @@ -118,8 +119,8 @@ class TableTextCodec(override val name: String, override val stringTerminator: List[Int] = List(stringTerminatorChar) - private def isPrintable(c: Char) = { - c.getType match { + private def isPrintable(c: Int) = { + Character.getType(c) match { case Character.LOWERCASE_LETTER => true case Character.UPPERCASE_LETTER => true case Character.TITLECASE_LETTER => true @@ -148,15 +149,16 @@ class TableTextCodec(override val name: String, } } - private def format(c:Char):String = { + private def format(c:Int):String = { val u = f"U+${c.toInt}%04X" - if (isPrintable(c)) f"`$c%c` ($u%s)" + if (isPrintable(c)) f"`${Character.toChars(c).mkString}%s` ($u%s)" else u } private def format(s:String) = { - val u = s.map(c => f"U+${c.toInt}%04X").mkString(",") - if (s.forall(isPrintable)) f"`$s%s` ($u%s)" + val codePoints = s.codePoints().toArray + val u = codePoints.map(c => f"U+${c}%04X").mkString(",") + if (codePoints.forall(isPrintable)) f"`$s%s` ($u%s)" else u } private def encodeChar(log: Logger, position: Option[Position], c: Char, options: CompilationOptions, lenient: Boolean): Option[List[Int]] = { @@ -177,10 +179,11 @@ class TableTextCodec(override val name: String, } - def encode(log: Logger, position: Option[Position], s: List[Char], options: CompilationOptions, lenient: Boolean): List[Int] = { + def encode(log: Logger, position: Option[Position], s: List[Int], options: CompilationOptions, lenient: Boolean): List[Int] = { + val LBRACE = '{'.toInt val lenient = options.flag(CompilationFlag.LenientTextEncoding) s match { - case '{' :: tail => + case LBRACE :: tail => val (escSeq, closingBrace) = tail.span(_ != '}') closingBrace match { case '}' :: xs => @@ -189,13 +192,16 @@ class TableTextCodec(override val name: String, log.error(f"Unclosed escape sequence", position) Nil } - case head :: tail => - (encodeChar(log, position, head, options, lenient) match { + case head :: tail if head >= Char.MinValue && head <= Char.MaxValue => + (encodeChar(log, position, head.toChar, options, lenient) match { case Some(x) => x case None => log.error(f"Invalid character ${format(head)} in string", position) Nil }) ++ encode(log, position, tail, options, lenient) + case head :: tail => + log.error(f"Invalid character ${format(head)} in string", position) + encode(log, position, tail, options, lenient) case Nil => Nil } } @@ -209,13 +215,13 @@ class TableTextCodec(override val name: String, } } if (escSeq == "program_name_upper") { - return encode(log, position, options.outputFileName.getOrElse("MILLFORK").toUpperCase(Locale.ROOT).toList, options, lenient) + return encode(log, position, options.outputFileName.getOrElse("MILLFORK").toUpperCase(Locale.ROOT).codePoints().toArray.toList, options, lenient) } if (escSeq == "program_name") { - return encode(log, position, options.outputFileName.getOrElse("MILLFORK").toList, options, lenient) + return encode(log, position, options.outputFileName.getOrElse("MILLFORK").codePoints().toArray.toList, options, lenient) } if (escSeq == "copyright_year") { - return encode(log, position, LocalDate.now.getYear.toString.toList, options, lenient) + return encode(log, position, LocalDate.now.getYear.toString.map(_.toInt).toList, options, lenient) } if (escSeq == "null" || escSeq == "nullchar") { return stringTerminator diff --git a/src/test/scala/millfork/test/TextCodecSuite.scala b/src/test/scala/millfork/test/TextCodecSuite.scala index 5b922a35..59e2f72a 100644 --- a/src/test/scala/millfork/test/TextCodecSuite.scala +++ b/src/test/scala/millfork/test/TextCodecSuite.scala @@ -55,6 +55,12 @@ class TextCodecSuite extends FunSuite with Matchers { | if p[1] != 0 { poke($bff8, 0) } | if p[2] != 0 { poke($bff7, 0) } | if p[3] != 0 { poke($bff6, 0) } + | p = "𓀀"utf8z + | if p[0] == 0 { poke($bff3, p[0]) } + | if p[1] == 0 { poke($bff2, p[1]) } + | if p[2] == 0 { poke($bff1, p[2]) } + | if p[3] == 0 { poke($bff0, p[3]) } + | if p[4] != 0 { poke($bfef, p[4]) } | } | macro asm void poke(word const addr, byte a) { | STA addr