Skip to content

Commit

Permalink
Add encodeLE/decodeLE, fixing #19
Browse files Browse the repository at this point in the history
  • Loading branch information
hasufell committed Jun 9, 2024
1 parent 4390b95 commit d153ebb
Show file tree
Hide file tree
Showing 5 changed files with 105 additions and 7 deletions.
4 changes: 4 additions & 0 deletions System/OsString.hs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ module System.OsString
, unsafeEncodeUtf
, encodeWith
, encodeFS
, encodeLE
, osstr
, empty
, singleton
Expand All @@ -33,6 +34,7 @@ module System.OsString
, decodeUtf
, decodeWith
, decodeFS
, decodeLE
, unpack

-- * Word types
Expand Down Expand Up @@ -137,13 +139,15 @@ import System.OsString.Internal
, unsafeEncodeUtf
, encodeWith
, encodeFS
, encodeLE
, osstr
, pack
, empty
, singleton
, decodeUtf
, decodeWith
, decodeFS
, decodeLE
, unpack
, snoc
, cons
Expand Down
52 changes: 49 additions & 3 deletions System/OsString/Common.hs
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ module System.OsString.MODULE_NAME
, unsafeEncodeUtf
, encodeWith
, encodeFS
, encodeLE
, fromBytes
, pstr
, singleton
Expand All @@ -43,6 +44,7 @@ module System.OsString.MODULE_NAME
, decodeUtf
, decodeWith
, decodeFS
, decodeLE
, unpack

-- * Word construction
Expand Down Expand Up @@ -242,14 +244,14 @@ encodeWith enc str = unsafePerformIO $ do

#ifdef WINDOWS_DOC
-- | This mimics the behavior of the base library when doing filesystem
-- operations, which does permissive UTF-16 encoding, where coding errors generate
-- operations (usually filepaths), which does permissive UTF-16 encoding, where coding errors generate
-- Chars in the surrogate range.
--
-- The reason this is in IO is because it unifies with the Posix counterpart,
-- which does require IO. This is safe to 'unsafePerformIO'/'unsafeDupablePerformIO'.
#else
-- | This mimics the behavior of the base library when doing filesystem
-- operations, which uses shady PEP 383 style encoding (based on the current locale,
-- operations (usually filepaths), which uses shady PEP 383 style encoding (based on the current locale,
-- but PEP 383 only works properly on UTF-8 encodings, so good luck).
--
-- Looking up the locale requires IO. If you're not worried about calls
Expand All @@ -263,6 +265,28 @@ encodeFS = fmap WindowsString . encodeWithBaseWindows
encodeFS = fmap PosixString . encodeWithBasePosix
#endif

#ifdef WINDOWS_DOC
-- | Encode a 'String' with permissive UTF-16 encoding, where coding errors generate
-- Chars in the surrogate range. On Windows this behaves exactly like 'encodeFS';
-- it exists so that the API unifies with the Posix counterpart, which uses the
-- locale encoding instead of the filesystem encoding.
--
-- The reason this is in IO is because it unifies with the Posix counterpart,
-- which does require IO. This is safe to 'unsafePerformIO'/'unsafeDupablePerformIO'.
#else
-- | This mimics the behavior of the base library when doing string
-- operations (e.g. reading file contents), which uses 'getLocaleEncoding'.
-- Unlike 'encodeFS', this does not use the filesystem encoding.
--
-- Looking up the locale requires IO. If you're not worried about calls
-- to 'setLocaleEncoding', then 'unsafePerformIO' may be feasible (make sure
-- to deeply evaluate the result to catch exceptions).
#endif
encodeLE :: String -> IO PLATFORM_STRING
#ifdef WINDOWS
encodeLE = fmap WindowsString . encodeWithBaseWindows
#else
encodeLE = fmap PosixString . encodeWithBasePosix'
#endif


#ifdef WINDOWS_DOC
-- | Partial unicode friendly decoding.
Expand Down Expand Up @@ -317,7 +341,29 @@ decodeWith unixEnc (PosixString ba) = unsafePerformIO $ do
-- which does require IO. 'unsafePerformIO'/'unsafeDupablePerformIO' are safe, however.
#else
-- | This mimics the behavior of the base library when doing filesystem
-- operations, which uses shady PEP 383 style encoding (based on the current locale,
-- operations, which uses 'getLocaleEncoding'.
--
-- Looking up the locale requires IO. If you're not worried about calls
-- to 'setLocaleEncoding', then 'unsafePerformIO' may be feasible (make sure
-- to deeply evaluate the result to catch exceptions).
#endif
decodeLE :: PLATFORM_STRING -> IO String
#ifdef WINDOWS
-- Windows build: unwrap the newtype and hand the payload to 'decodeWithBaseWindows'.
decodeLE (WindowsString bytes) = decodeWithBaseWindows bytes
#else
-- Posix build: unwrap the newtype and decode via 'decodeWithBasePosix''
-- (the locale-encoding variant, not the filesystem-encoding one).
decodeLE (PosixString bytes) = decodeWithBasePosix' bytes
#endif

#ifdef WINDOWS_DOC
-- | Like 'decodeUtf', except this mimics the behavior of the base library when doing filesystem
-- operations (usually filepaths), which does permissive UTF-16 encoding, where coding errors generate
-- Chars in the surrogate range.
--
-- The reason this is in IO is because it unifies with the Posix counterpart,
-- which does require IO. 'unsafePerformIO'/'unsafeDupablePerformIO' are safe, however.
#else
-- | This mimics the behavior of the base library when doing filesystem
-- operations (usually filepaths), which uses shady PEP 383 style encoding (based on the current locale,
-- but PEP 383 only works properly on UTF-8 encodings, so good luck).
--
-- Looking up the locale requires IO. If you're not worried about calls
Expand Down
2 changes: 2 additions & 0 deletions System/OsString/Encoding.hs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ module System.OsString.Encoding
-- * base encoding
, encodeWithBasePosix
, decodeWithBasePosix
, encodeWithBasePosix'
, decodeWithBasePosix'
, encodeWithBaseWindows
, decodeWithBaseWindows
)
Expand Down
24 changes: 21 additions & 3 deletions System/OsString/Encoding/Internal.hs
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ import Numeric (showHex)
import Foreign.C (CStringLen)
import Data.Char (chr)
import Foreign
import GHC.IO.Encoding (getFileSystemEncoding)
import GHC.IO.Encoding (getFileSystemEncoding, getLocaleEncoding)

-- -----------------------------------------------------------------------------
-- UCS-2 LE
Expand Down Expand Up @@ -270,9 +270,15 @@ peekWindowsString (cp, l) = do
-- | Encode a 'String' with the current filesystem encoding
-- ('getFileSystemEncoding') and run the continuation on the resulting
-- 'CStringLen' (via 'GHC.withCStringLen').
withPosixString :: String -> (CStringLen -> IO a) -> IO a
withPosixString str action = do
  fsEnc <- getFileSystemEncoding
  GHC.withCStringLen fsEnc str action

-- | Encode a 'String' with the current locale encoding
-- ('getLocaleEncoding') and run the continuation on the resulting
-- 'CStringLen' (via 'GHC.withCStringLen').
withPosixString' :: String -> (CStringLen -> IO a) -> IO a
withPosixString' str action = do
  localeEnc <- getLocaleEncoding
  GHC.withCStringLen localeEnc str action

-- | Decode the bytes behind a 'CStringLen' into a 'String' using the
-- current filesystem encoding ('getFileSystemEncoding'), via
-- 'GHC.peekCStringLen'.
peekPosixString :: CStringLen -> IO String
peekPosixString cstrLen = do
  fsEnc <- getFileSystemEncoding
  GHC.peekCStringLen fsEnc cstrLen

-- | Decode the bytes behind a 'CStringLen' into a 'String' using the
-- current locale encoding ('getLocaleEncoding'), via 'GHC.peekCStringLen'.
peekPosixString' :: CStringLen -> IO String
peekPosixString' cstrLen = do
  localeEnc <- getLocaleEncoding
  GHC.peekCStringLen localeEnc cstrLen

-- | Decode with the given 'TextEncoding'.
decodeWithTE :: TextEncoding -> BS8.ShortByteString -> Either EncodingException String
decodeWithTE enc ba = unsafePerformIO $ do
Expand All @@ -289,18 +295,30 @@ encodeWithTE enc str = unsafePerformIO $ do
-- Encoders / decoders
--

-- | This mimics the filepath decoder base uses on unix,
-- | This mimics the filepath decoder base uses on unix (PEP-383 style):
-- the bytes are exposed as a 'CStringLen' and read back with 'peekPosixString',
-- i.e. with the filesystem encoding. The small distinction from base is that
-- we're not truncating at NUL bytes (because we're not at the outer FFI layer).
decodeWithBasePosix :: BS8.ShortByteString -> IO String
decodeWithBasePosix bytes = BS8.useAsCStringLen bytes peekPosixString

-- | This mimics the filepath dencoder base uses on unix,
-- | This mimics the string decoder base uses on unix: the bytes are exposed
-- as a 'CStringLen' and read back with 'peekPosixString'', i.e. with the
-- locale encoding. The small distinction from base is that we're not
-- truncating at NUL bytes (because we're not at the outer FFI layer).
decodeWithBasePosix' :: BS8.ShortByteString -> IO String
decodeWithBasePosix' bytes = BS8.useAsCStringLen bytes peekPosixString'

-- | This mimics the filepath encoder base uses on unix (PEP-383 style):
-- the 'String' is marshalled with 'withPosixString' (filesystem encoding)
-- and the resulting buffer is copied into a 'BS8.ShortByteString'.
-- The small distinction from base is that we're not truncating at NUL bytes
-- (because we're not at the outer FFI layer).
encodeWithBasePosix :: String -> IO BS8.ShortByteString
encodeWithBasePosix input = withPosixString input BS8.packCStringLen

-- | This mimics the string encoder base uses on unix: the 'String' is
-- marshalled with 'withPosixString'' (locale encoding) and the resulting
-- buffer is copied into a 'BS8.ShortByteString'.
-- The small distinction from base is that we're not truncating at NUL bytes
-- (because we're not at the outer FFI layer).
encodeWithBasePosix' :: String -> IO BS8.ShortByteString
encodeWithBasePosix' input = withPosixString' input BS8.packCStringLen

-- | This mimics the filepath decoder base uses on windows,
-- with the small distinction that we're not truncating at NUL bytes (because we're not at
-- the outer FFI layer).
Expand Down
30 changes: 29 additions & 1 deletion System/OsString/Internal.hs
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ encodeWith unixEnc _ str = OsString <$> PF.encodeWith unixEnc str
#endif

-- | Like 'encodeUtf', except this mimics the behavior of the base library when doing filesystem
-- operations, which is:
-- operations (usually filepaths), which is:
--
-- 1. on unix, uses shady PEP 383 style encoding (based on the current locale,
-- but PEP 383 only works properly on UTF-8 encodings, so good luck)
Expand All @@ -84,6 +84,20 @@ encodeWith unixEnc _ str = OsString <$> PF.encodeWith unixEnc str
encodeFS :: String -> IO OsString
encodeFS str = OsString <$> PF.encodeFS str -- run 'PF.encodeFS', then wrap its result in 'OsString'

-- | Like 'encodeUtf', except this mimics the behavior of the base library when doing string
-- operations (e.g. reading file contents), which is:
--
-- 1. on unix, uses the current locale encoding (see 'getLocaleEncoding');
--    unlike 'encodeFS', this is not the filesystem encoding
-- 2. on windows does permissive UTF-16 encoding, where coding errors generate
--    Chars in the surrogate range
--
-- Looking up the locale requires IO. If you're not worried about calls
-- to 'setLocaleEncoding', then 'unsafePerformIO' may be feasible (make sure
-- to deeply evaluate the result to catch exceptions).
encodeLE :: String -> IO OsString
encodeLE = fmap OsString . PF.encodeLE


-- | Partial unicode friendly decoding.
--
Expand Down Expand Up @@ -123,6 +137,20 @@ decodeWith unixEnc _ (OsString x) = PF.decodeWith unixEnc x
decodeFS :: OsString -> IO String
decodeFS (OsString payload) = PF.decodeFS payload -- unwrap 'OsString' and delegate to 'PF.decodeFS'

-- | Like 'decodeUtf', except this mimics the behavior of the base library when doing string operations
-- (e.g. reading file contents), which is:
--
-- 1. on unix, uses the current locale encoding (see 'getLocaleEncoding');
--    unlike 'decodeFS', this is not the filesystem encoding
-- 2. on windows does permissive UTF-16 decoding, where coding errors generate
--    Chars in the surrogate range
--
-- Looking up the locale requires IO. If you're not worried about calls
-- to 'setLocaleEncoding', then 'unsafePerformIO' may be feasible (make sure
-- to deeply evaluate the result to catch exceptions).
decodeLE :: OsString -> IO String
decodeLE (OsString x) = PF.decodeLE x


-- | Constructs an @OsString@ from a ByteString.
--
Expand Down

0 comments on commit d153ebb

Please sign in to comment.