Skip to content

Commit

Permalink
BuiltinByteString literals aren't UTF8 encoded.
Browse files Browse the repository at this point in the history
  • Loading branch information
Unisay committed Nov 13, 2024
1 parent c082e28 commit 5ff83c0
Show file tree
Hide file tree
Showing 22 changed files with 583 additions and 1,108 deletions.
1 change: 1 addition & 0 deletions .hlint.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,4 @@
- ignore: {name: Use first, within: [UntypedPlutusCore.Evaluation.Machine.Cek]}
- ignore: {name: Redundant if, within: [PlutusLedgerApi.V1.Value, PlutusLedgerApi.V1.Data.Value]}
- ignore: {name: Replace case with maybe, within: [PlutusLedgerApi.V1.Value, PlutusLedgerApi.V1.Data.Value]}
- ignore: {name: Use bimap, within: [PlutusTx.Builtins.HasOpaque]}
52 changes: 39 additions & 13 deletions plutus-benchmark/bitwise/src/PlutusBenchmark/Ed25519/Compiled.hs
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
-- editorconfig-checker-disable-file
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeApplications #-}

module PlutusBenchmark.Ed25519.Compiled (
checkValidCompiled,
msgAsData,
signatureAsData,
pkAsData
module PlutusBenchmark.Ed25519.Compiled
( checkValidCompiled
, msgAsData
, signatureAsData
, pkAsData
) where

import PlutusBenchmark.Ed25519 (checkValid)
Expand All @@ -19,17 +19,43 @@ import PlutusTx.Plugin ()
import PlutusTx.Prelude
import PlutusTx.TH (compile)

checkValidCompiled :: CompiledCode (BuiltinData -> BuiltinData -> BuiltinData -> Bool)
checkValidCompiled = $$(compile [|| \signature msg pk -> checkValid (unsafeFromBuiltinData signature)
(unsafeFromBuiltinData msg)
(unsafeFromBuiltinData pk) ||])
checkValidCompiled
:: CompiledCode (BuiltinData -> BuiltinData -> BuiltinData -> Bool)
checkValidCompiled =
$$( compile
[||
\signature msg pk ->
checkValid
(unsafeFromBuiltinData signature)
(unsafeFromBuiltinData msg)
(unsafeFromBuiltinData pk)
||]
)

msgAsData :: CompiledCode BuiltinData
msgAsData = liftCodeDef (toBuiltinData ("hello world" :: BuiltinByteString))
msgAsData =
liftCodeDef
$ toBuiltinData @BuiltinByteString
"\x68\x65\x6C\x6C\x6F\x20\x77\x6F\x72\x6C\x64"

signatureAsData :: CompiledCode BuiltinData
signatureAsData =
$$(compile [|| toBuiltinData ("\NUL\147!x\173\167\209z`\t\243|\195$X$\233\166\234\NUL\134\152l\DC4\243\&4\217\NAK\152\180{$M\227R\214\218%\241\157\ENQ\SO\ENQ\t\152\140\171\240\200f\184\133\203\227z\163\NUL\185\155Y\139\178\249\STX" :: BuiltinByteString) ||])
$$( compile
[||
toBuiltinData @BuiltinByteString
"\xC0\x80\xC2\x93\x21\x78\xC2\xAD\xC2\xA7\xC3\x91\x7A\x60\x09\xC3\xB3\
\\x7C\xC3\x83\x24\x58\x24\xC3\xA9\xC2\xA6\xC3\xAA\xC0\x80\xC2\x86\xC2\
\\x98\x6C\x14\xC3\xB3\x34\xC3\x99\x15\xC2\x98\xC2\xB4\x7B\x24\x4D\xC3\
\\xA3\x52\xC3\x96\xC3\x9A\x25\xC3\xB1\xC2\x9D\x05\x0E\x05\x09\xC2\x98\
\\xC2\x8C\xC2\xAB\xC3\xB0\xC3\x88\x66\xC2\xB8\xC2\x85\xC3\x8B\xC3\xA3\
\\x7A\xC2\xA3\xC0\x80\xC2\xB9\xC2\x9B\x59\xC2\x8B\xC2\xB2\xC3\xB9\x02"
||]
)

pkAsData :: CompiledCode BuiltinData
pkAsData = liftCodeDef (toBuiltinData ("(:\255\251\129\&7-^w\253\145\vh\ESC\171r\189\223/\213Qzb\249\175$z\211q\195\DC1\198" :: BuiltinByteString))
pkAsData =
liftCodeDef
$ toBuiltinData @BuiltinByteString
"\x28\x3A\xC3\xBF\xC3\xBB\xC2\x81\x37\x2D\x5E\x77\xC3\xBD\xC2\x91\x0B\x68\
\\x1B\xC2\xAB\x72\xC2\xBD\xC3\x9F\x2F\xC3\x95\x51\x7A\x62\xC3\xB9\xC2\xAF\
\\x24\x7A\xC3\x93\x71\xC3\x83\x11\xC3\x86"
7 changes: 2 additions & 5 deletions plutus-core/cost-model/test/TH.hs
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,14 @@
restrictions.
-}

{-# LANGUAGE TemplateHaskell #-}

module TH (genTest)
where
module TH (genTest) where

import Data.Char (toUpper)
import Language.Haskell.TH

toUpper1 :: String -> String
toUpper1 [] = error "empty string in toUpper1"
toUpper1 (c:cs) = (toUpper c):cs
toUpper1 (c:cs) = toUpper c : cs

-- | Apply an expression to a list of argument expressions, nesting the
-- applications to the left: @mkIterApp f [a, b]@ is @AppE (AppE f a) b@.
-- With an empty argument list the head expression is returned unchanged.
mkIterApp :: Exp -> [Exp] -> Exp
mkIterApp = foldl AppE
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
### Changed

- `BuiltinByteString` literals are no longer UTF-8 encoded: each character in the range 0-255 now denotes exactly one byte, e.g. `"\x00\x01\x02" :: BuiltinByteString` or `stringToBuiltinByteString "\0\42\255"`.

1 change: 1 addition & 0 deletions plutus-tx-plugin/plutus-tx-plugin.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ test-suite plutus-tx-plugin-tests
Budget.Spec
Budget.WithGHCOptimisations
Budget.WithoutGHCOptimisations
ByteStringLiterals.Spec
IntegerLiterals.NoStrict.NegativeLiterals.Spec
IntegerLiterals.NoStrict.NoNegativeLiterals.Spec
IntegerLiterals.Strict.NegativeLiterals.Spec
Expand Down
146 changes: 100 additions & 46 deletions plutus-tx-plugin/src/PlutusTx/Compiler/Expr.hs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
{-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE TypeOperators #-}
{-# LANGUAGE ViewPatterns #-}

{-# OPTIONS_GHC -Wno-partial-type-signatures #-}

-- | Functions for compiling GHC Core expressions into Plutus Core terms.
Expand All @@ -24,6 +25,7 @@ import GHC.Core qualified as GHC
import GHC.Core.Class qualified as GHC
import GHC.Core.Multiplicity qualified as GHC
import GHC.Core.TyCo.Rep qualified as GHC
import GHC.Num.Integer qualified
import GHC.Plugins qualified as GHC
import GHC.Types.CostCentre qualified as GHC
import GHC.Types.Id.Make qualified as GHC
Expand Down Expand Up @@ -76,12 +78,13 @@ import Data.ByteString qualified as BS
import Data.Generics.Uniplate.Data (transform, universeBi)
import Data.List (elemIndex, isPrefixOf, isSuffixOf)
import Data.Map qualified as Map
import Data.Maybe
import Data.Maybe (mapMaybe)
import Data.Set qualified as Set
import Data.Text qualified as T
import Data.Text.Encoding qualified as TE
import Data.Traversable
import GHC.Num.Integer qualified
import Data.Traversable (for)
import Data.Word (Word8)


{- Note [System FC and System FW]
Haskell uses system FC, which includes type equalities and coercions.
Expand Down Expand Up @@ -136,23 +139,51 @@ compileLiteral = \case
-- do different things to the inner expression. This one assumes it's a literal, the other one keeps compiling
-- through it.

-- | Get the bytestring content of a string expression, if possible. Follows (Haskell) variable references!
stringExprContent :: GHC.CoreExpr -> Maybe BS.ByteString
stringExprContent = \case
GHC.Lit (GHC.LitString bs) -> Just bs
-- unpackCString# / unpackCStringUtf8# are just wrappers around a literal
GHC.Var n `GHC.App` expr
| let name = GHC.getName n
, name == GHC.unpackCStringName || name == GHC.unpackCStringUtf8Name ->
stringExprContent expr
-- | Selects how 'stringExprContent' treats the bytes of a literal that is
-- wrapped in @unpackCStringUtf8#@: with 'AsText' the UTF-8 encoded bytes are
-- returned unchanged, with 'AsBytes' they are decoded by 'fromUtf8' first.
data StringExprContentAs = AsBytes | AsText

-- | Get the bytestring content of a string expression, if possible.
-- Follows (Haskell) variable references!
stringExprContent :: StringExprContentAs -> GHC.CoreExpr -> Maybe BS.ByteString
stringExprContent contentAs coreExpr = case coreExpr of
GHC.Lit (GHC.LitString bytes) ->
Just bytes
GHC.Var isUnpackCString `GHC.App` GHC.Lit (GHC.LitString bytes)
| GHC.getName isUnpackCString == GHC.unpackCStringName ->
Just bytes
GHC.Var isUnpackCStringUtf8 `GHC.App` GHC.Lit (GHC.LitString bytes)
| GHC.getName isUnpackCStringUtf8 == GHC.unpackCStringUtf8Name ->
case contentAs of
AsText -> Just bytes
AsBytes ->
-- GHC stores bytestring literals UTF-8 encoded, decoding them at runtime.
-- In Plinth we decode such bytestrings at compile time.
BS.pack <$> fromUtf8 (BS.unpack bytes)
-- See Note [unpackFoldrCString#]
GHC.Var build `GHC.App` _ `GHC.App` GHC.Lam _ (GHC.Var unpack `GHC.App` _ `GHC.App` expr)
| GHC.getName build == GHC.buildName && GHC.getName unpack == GHC.unpackCStringFoldrName -> stringExprContent expr
| GHC.getName build == GHC.buildName && GHC.getName unpack == GHC.unpackCStringFoldrName ->
stringExprContent contentAs expr
-- GHC helpfully generates an empty list for the empty string literal instead of a 'LitString'
GHC.Var nil `GHC.App` GHC.Type (GHC.tyConAppTyCon_maybe -> Just tc)
| nil == GHC.dataConWorkId GHC.nilDataCon, GHC.getName tc == GHC.charTyConName -> Just mempty
-- Chase variable references! GHC likes to lift string constants to variables, that is not good for us!
GHC.Var (GHC.maybeUnfoldingTemplate . GHC.realIdUnfolding -> Just unfolding) -> stringExprContent unfolding
| nil == GHC.dataConWorkId GHC.nilDataCon, GHC.getName tc == GHC.charTyConName ->
Just mempty
-- Chase variable references! GHC likes to lift string constants to variables,
-- that is not good for us!
GHC.Var (GHC.maybeUnfoldingTemplate . GHC.realIdUnfolding -> Just unfolding) ->
stringExprContent contentAs unfolding
_ -> Nothing

{- | Undo GHC's UTF-8 encoding of a bytestring literal, recovering one byte
per original character.

This is deliberately not a general UTF-8 decoder. It accepts only the
encodings of characters in the range 0x00 - 0xFF, which is all a bytestring
literal is expected to contain:

  * @0xC0 0x80@ (the two-byte form of NUL) yields @0x00@;
  * @0xC2 b@ with @b@ in 0x80 - 0xBF yields @b@;
  * @0xC3 b@ with @b@ in 0x80 - 0xBF yields @b + 0x40@, i.e. 0xC0 - 0xFF;
  * a single byte in 0x01 - 0x7F yields itself.

Any other input, including a bare @0x00@ byte or a truncated two-byte
sequence, makes the whole result 'Nothing'.
-}
fromUtf8 :: [Word8] -> Maybe [Word8]
fromUtf8 input = case input of
  [] -> Just []
  0xC0 : 0x80 : more -> emit 0x00 more
  0xC2 : c : more | isContinuation c -> emit c more
  0xC3 : c : more | isContinuation c -> emit (c + 0x40) more
  c : more | c >= 0x01 && c <= 0x7F -> emit c more
  _ -> Nothing
  where
    -- Second byte of a two-byte UTF-8 sequence: 0b10xxxxxx.
    isContinuation c = c >= 0x80 && c <= 0xBF
    -- Decode the remaining input and put the decoded byte in front of it.
    emit byte more = fmap (byte :) (fromUtf8 more)

{- | Strip off irrelevant things when we're trying to match a particular pattern in the code. Mostly ticks.
Expand Down Expand Up @@ -699,13 +730,21 @@ compileExpr e = traceCompilation 2 ("Compiling expr:" GHC.<+> GHC.ppr e) $ do
_ -> throwPlain $ CompilationError "No info for Pair builtin"

-- TODO: Maybe share this to avoid repeated lookups. Probably cheap, though.
(stringTyName, sbsName) <- case (Map.lookup ''Builtins.BuiltinString nameInfo, Map.lookup 'Builtins.stringToBuiltinString nameInfo) of
(stringTyName, sbsName) <-
case
( Map.lookup ''Builtins.BuiltinString nameInfo
, Map.lookup 'Builtins.stringToBuiltinString nameInfo
) of
(Just t1, Just t2) -> pure (GHC.getName t1, GHC.getName t2)
_ -> throwPlain $ CompilationError "No info for String builtin"

(bsTyName, sbbsName) <- case (Map.lookup ''Builtins.BuiltinByteString nameInfo, Map.lookup 'Builtins.stringToBuiltinByteString nameInfo) of
(Just t1, Just t2) -> pure (GHC.getName t1, GHC.getName t2)
_ -> throwPlain $ CompilationError "No info for ByteString builtin"
(builtinByteStringTyName, sbbsName) <-
case
( Map.lookup ''Builtins.BuiltinByteString nameInfo
, Map.lookup 'Builtins.stringToBuiltinByteString nameInfo
) of
(Just t1, Just t2) -> pure (GHC.getName t1, GHC.getName t2)
_ -> throwPlain $ CompilationError "No info for ByteString builtin"

useToOpaqueName <- GHC.getName <$> getThing 'Builtins.useToOpaque
useFromOpaqueName <- GHC.getName <$> getThing 'Builtins.useFromOpaque
Expand All @@ -730,39 +769,54 @@ compileExpr e = traceCompilation 2 ("Compiling expr:" GHC.<+> GHC.ppr e) $ do
-- to know we're looking at fromString.
-- We can safely commit to this match as soon as we've seen fromString - we won't accept
-- any applications of fromString that aren't creating literals of our builtin types.
(strip -> GHC.Var (GHC.idDetails -> GHC.ClassOpId cls)) `GHC.App` GHC.Type ty `GHC.App` _ `GHC.App` content
(strip -> GHC.Var (GHC.idDetails -> GHC.ClassOpId cls))
`GHC.App` GHC.Type ty `GHC.App` _dict `GHC.App` content
| GHC.getName cls == GHC.isStringClassName ->
case GHC.tyConAppTyCon_maybe ty of
Just tc -> case stringExprContent (strip content) of
Just bs ->
if
| GHC.getName tc == bsTyName -> pure $ PIR.Constant annMayInline $ PLC.someValue bs
| GHC.getName tc == stringTyName -> case TE.decodeUtf8' bs of
Right t -> pure $ PIR.Constant annMayInline $ PLC.someValue t
Left err ->
throwPlain . CompilationError $
"Text literal with invalid UTF-8 content: " <> (T.pack $ show err)
| otherwise ->
throwSd UnsupportedError $
"Use of fromString on type other than builtin strings or bytestrings:" GHC.<+> GHC.ppr ty
Nothing ->
throwSd CompilationError $
"Use of fromString with inscrutable content:" GHC.<+> GHC.ppr content
case GHC.tyConAppTyCon_maybe ty of -- extract Type constructor without arguments
Just tc ->
if
| GHC.getName tc == builtinByteStringTyName ->
case stringExprContent AsBytes (strip content) of
Nothing ->
throwSd CompilationError $
"Use of fromString @BuiltinByteString with inscrutable content:"
GHC.<+> GHC.ppr content
Just bs ->
pure $ PIR.Constant annMayInline $ PLC.someValue bs
| GHC.getName tc == stringTyName ->
case stringExprContent AsText (strip content) of
Nothing ->
throwSd CompilationError $
"Use of fromString @BuiltinString with inscrutable content:"
GHC.<+> GHC.ppr content
Just bs ->
case TE.decodeUtf8' bs of
Right t -> pure $ PIR.Constant annMayInline $ PLC.someValue t
Left err ->
throwPlain . CompilationError $
"Text literal with invalid UTF-8 content: " <> T.pack (show err)
| otherwise ->
throwSd UnsupportedError $
"Use of fromString on type other than builtin strings or bytestrings:"
GHC.<+> GHC.ppr ty
Nothing ->
throwSd UnsupportedError $
"Use of fromString on type other than builtin strings or bytestrings:" GHC.<+> GHC.ppr ty
"Use of fromString on type other than builtin strings or bytestrings:"
GHC.<+> GHC.ppr ty

-- 'stringToBuiltinByteString' invocation
(strip -> GHC.Var n) `GHC.App` (strip -> stringExprContent -> Just bs)
(strip -> GHC.Var n) `GHC.App` (strip -> stringExprContent AsBytes -> Just bs)
| GHC.getName n == sbbsName ->
pure $ PIR.Constant annMayInline $ PLC.someValue bs
-- 'stringToBuiltinString' invocation
(strip -> GHC.Var n) `GHC.App` (strip -> stringExprContent -> Just bs) | GHC.getName n == sbsName ->
case TE.decodeUtf8' bs of
Right t -> pure $ PIR.Constant annMayInline $ PLC.someValue t
Left err ->
throwPlain $
CompilationError $
"Text literal with invalid UTF-8 content: " <> (T.pack $ show err)
(strip -> GHC.Var n) `GHC.App` (strip -> stringExprContent AsText -> Just bs)
| GHC.getName n == sbsName ->
case TE.decodeUtf8' bs of
Right t -> pure $ PIR.Constant annMayInline $ PLC.someValue t
Left err ->
throwPlain $
CompilationError $
"Text literal with invalid UTF-8 content: " <> (T.pack $ show err)
-- See Note [Literals]
GHC.Lit lit -> compileLiteral lit
-- These are all wrappers around string and char literals, but keeping them allows us to give better errors
Expand Down
Loading

0 comments on commit 5ff83c0

Please sign in to comment.