Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix #86 #87

Merged
merged 2 commits into from
Sep 20, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 24 additions & 7 deletions unicode-data-scripts/bench/Main.hs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import Control.DeepSeq (NFData, deepseq, force)
import Control.Exception (evaluate)
import Data.Ix (Ix(..))
import Test.Tasty.Bench
(Benchmark, bgroup, bench, defaultMain, env, nf)

Expand All @@ -10,22 +11,38 @@ main :: IO ()
main = defaultMain
[ bgroup "Unicode.Char.General.Script"
[ bgroup "script"
[ benchNF "unicode-data" (show . S.script)
[ benchChars "unicode-data" (show . S.script)
]
, bgroup "scriptDefinition"
[ benchNF "unicode-data" (show . S.scriptDefinition)
]
-- [TODO] scriptDefinition, inScript
]
]
where
benchNF :: forall a. (NFData a) => String -> (Char -> a) -> Benchmark
benchNF t f =
benchChars :: forall a. (NFData a) => String -> (Char -> a) -> Benchmark
benchChars t f =
-- Avoid side-effects with garbage collection (see tasty-bench doc)
env
(evaluate (force chars)) -- initialize
(bench t . nf (fold_ f)) -- benchmark
(bench t . nf (foldString f)) -- benchmark
where
-- Filter out: Surrogates, Private Use Areas and unsassigned code points
chars = filter isValid [minBound..maxBound]
isValid c = G.generalCategory c < G.Surrogate

fold_ :: forall a. (NFData a) => (Char -> a) -> String -> ()
fold_ f = foldr (deepseq . f) ()
foldString :: forall a. (NFData a) => (Char -> a) -> String -> ()
foldString f = foldr (deepseq . f) ()

benchNF
:: forall a b. (Bounded a, Ix a, NFData b)
=> String
-> (a -> b)
-> Benchmark
benchNF t f = bench t (nf (fold_ f) (minBound, maxBound))

fold_
:: forall a b. (Ix a, NFData b)
=> (a -> b)
-> (a, a)
-> ()
fold_ f = foldr (deepseq . f) () . range
7 changes: 0 additions & 7 deletions unicode-data-scripts/lib/Unicode/Char/General/Scripts.hs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ module Unicode.Char.General.Scripts
( S.Script(..)
, script
, scriptDefinition
, inScript
)
where

Expand All @@ -39,12 +38,6 @@ import qualified Unicode.Internal.Char.Scripts as S
script :: Char -> S.Script
script = toEnum . S.script

-- [TODO] @since
-- | Check if a character is in a 'S.Script'.
{-# INLINE inScript #-}
inScript :: S.Script -> Char -> Bool
inScript s = (== s) . script

{- HLINT ignore scriptDefinition "Eta reduce" -}
-- [TODO] @since
-- | Characters correspinding to a 'S.Script'.
Expand Down
66 changes: 58 additions & 8 deletions unicode-data-scripts/test/Unicode/Char/General/ScriptsSpec.hs
Original file line number Diff line number Diff line change
@@ -1,12 +1,26 @@
{-# LANGUAGE BlockArguments #-}
{-# LANGUAGE BlockArguments, CPP #-}

module Unicode.Char.General.ScriptsSpec
( spec
) where

import qualified Unicode.Char.General.Scripts as UScripts
import Data.Foldable (traverse_)
import Test.Hspec
import qualified Unicode.Char.General.Scripts as UScripts
import qualified Unicode.Internal.Char.Scripts as S

import GHC.Exts
(Ptr(..), Char(..), Int(..),
indexWord32OffAddr#, int2Word#,
and#, isTrue#, eqWord#, leWord#, neWord#,
andI#, (-#), (<#),
ord#)
#if MIN_VERSION_base(4,16,0)
import GHC.Exts (word32ToWord#)
#endif
#ifdef WORDS_BIGENDIAN
import GHC.Exts (byteSwap32#)
#endif

{- [NOTE]
These tests may fail if the compiler’s Unicode version
Expand All @@ -26,14 +40,11 @@ does not match the version of this package.
spec :: Spec
spec = do
describe "Unicode scripts" do
it "inScript"
let check s = if all (UScripts.inScript s) (UScripts.scriptDefinition s)
then pure ()
else expectationFailure (show s)
in traverse_ check [minBound..maxBound]
it "Characters are in the definition of their corresponding script"
let {
check c = let s = UScripts.script c in if UScripts.inScript s c
check c =
let s = UScripts.script c
in if s `inScript` c
then pure ()
else expectationFailure $ mconcat
[ "Char “", show c, "” in not in the definition of “"
Expand All @@ -49,3 +60,42 @@ spec = do
check s = let chars = UScripts.scriptDefinition s
in traverse_ (checkChar s) chars
} in traverse_ check [minBound..maxBound]

{- HLINT ignore inScript "Eta reduce" -}
-- Check if a character is in a 'S.Script'.
-- This is faster than testing the string from UScripts.scriptDefinition
inScript :: S.Script -> Char -> Bool
inScript s (C# c#) = check (S.scriptDefinition s)
where
-- [NOTE] see 'scriptDefinition' for the description of the encoding.

scriptRangeMask# = 0x80000000## -- 1 << 31
maskComplement# = 0x7fffffff## -- 1 << 31 ^ 0xffffffff
cp# = int2Word# (ord# c#)

check (Ptr addr#, I# n#) = let {
getRawCodePoint k# =
#ifdef WORDS_BIGENDIAN
#if MIN_VERSION_base(4,16,0)
byteSwap32# (word32ToWord# (indexWord32OffAddr# addr# k#));
#else
byteSwap32# (indexWord32OffAddr# addr# k#);
#endif
#elif MIN_VERSION_base(4,16,0)
word32ToWord# (indexWord32OffAddr# addr# k#);
#else
indexWord32OffAddr# addr# k#;
#endif
getCodePoint k# = and# maskComplement# k#;
find k# = not (isTrue# (k# <# 0#)) &&
let {
r1# = getRawCodePoint k#;
c1# = getCodePoint r1#;
isRange = isTrue# (and# r1# scriptRangeMask# `neWord#` 0##)
} in if isRange
then let {
c2# = getCodePoint (getRawCodePoint (k# -# 1#));
found = isTrue# ((c2# `leWord#` cp#) `andI#` (cp# `leWord#` c1#))
} in found || find (k# -# 2#)
else isTrue# (c1# `eqWord#` cp#) || find (k# -# 1#)
} in find (n# -# 1#)
103 changes: 60 additions & 43 deletions unicode-data/bench/Main.hs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import Control.DeepSeq (NFData, deepseq, force)
import Control.Exception (evaluate)
import Data.Ix (Ix(..))
import Test.Tasty.Bench
(Benchmark, bgroup, bench, bcompare, defaultMain, env, nf)

Expand All @@ -24,22 +25,22 @@ main :: IO ()
main = defaultMain
[ bgroup "Unicode.Char.Case"
[ bgroup "isLowerCase"
[ benchNF "unicode-data" C.isLowerCase
[ benchChars "unicode-data" C.isLowerCase
]
, bgroup "isUpperCase"
[ benchNF "unicode-data" C.isUpperCase
[ benchChars "unicode-data" C.isUpperCase
]
, bgroup "toCaseFoldString"
[ benchNF "unicode-data" C.toCaseFoldString
[ benchChars "unicode-data" C.toCaseFoldString
]
, bgroup "toLowerString"
[ benchNF "unicode-data" C.toLowerString
[ benchChars "unicode-data" C.toLowerString
]
, bgroup "toTitleString"
[ benchNF "unicode-data" C.toTitleString
[ benchChars "unicode-data" C.toTitleString
]
, bgroup "toUpperString"
[ benchNF "unicode-data" C.toUpperString
[ benchChars "unicode-data" C.toUpperString
]
]
, bgroup "Unicode.Char.Case.Compat"
Expand Down Expand Up @@ -71,7 +72,7 @@ main = defaultMain
, Bench "unicode-data" (show . G.generalCategory)
]
, bgroup "isAlphabetic"
[ benchNF "unicode-data" G.isAlphabetic
[ benchChars "unicode-data" G.isAlphabetic
]
, bgroup' "isAlphaNum"
[ Bench "base" Char.isAlphaNum
Expand Down Expand Up @@ -102,35 +103,37 @@ main = defaultMain
, Bench "unicode-data" G.isSymbol
]
, bgroup "isWhiteSpace"
[ benchNF "unicode-data" G.isWhiteSpace
[ benchChars "unicode-data" G.isWhiteSpace
]
-- Korean Hangul Characters
, bgroup "isHangul"
[ benchNF "unicode-data" G.isHangul
[ benchChars "unicode-data" G.isHangul
]
, bgroup "isHangulLV"
[ benchNF "unicode-data" G.isHangul
[ benchChars "unicode-data" G.isHangul
]
, bgroup "isJamo"
[ benchNF "unicode-data" G.isJamo
[ benchChars "unicode-data" G.isJamo
]
, bgroup "jamoLIndex"
[ benchNF "unicode-data" G.jamoLIndex
[ benchChars "unicode-data" G.jamoLIndex
]
, bgroup "jamoVIndex"
[ benchNF "unicode-data" G.jamoVIndex
[ benchChars "unicode-data" G.jamoVIndex
]
, bgroup "jamoTIndex"
[ benchNF "unicode-data" G.jamoTIndex
[ benchChars "unicode-data" G.jamoTIndex
]
]
, bgroup "Unicode.Char.General.Blocks"
[ bgroup "block"
[ benchNF "unicode-data" (show . B.block)
[ benchChars "unicode-data" (show . B.block)
]
, bgroup "blockDefinition"
[ benchNF "unicode-data" (show . B.blockDefinition)
]
-- [TODO] blockDefinition, inBlock
, bgroup "allBlockRanges"
[ benchNF "unicode-data" (const B.allBlockRanges)
[ benchChars "unicode-data" (const B.allBlockRanges)
]
]
, bgroup "Unicode.Char.General.Compat"
Expand All @@ -149,66 +152,66 @@ main = defaultMain
]
, bgroup "Unicode.Char.Identifiers"
[ bgroup "isIDContinue"
[ benchNF "unicode-data" I.isIDContinue
[ benchChars "unicode-data" I.isIDContinue
]
, bgroup "isIDStart"
[ benchNF "unicode-data" I.isIDStart
[ benchChars "unicode-data" I.isIDStart
]
, bgroup "isXIDContinue"
[ benchNF "unicode-data" I.isXIDContinue
[ benchChars "unicode-data" I.isXIDContinue
]
, bgroup "isXIDStart"
[ benchNF "unicode-data" I.isXIDStart
[ benchChars "unicode-data" I.isXIDStart
]
, bgroup "isPatternSyntax"
[ benchNF "unicode-data" I.isPatternSyntax
[ benchChars "unicode-data" I.isPatternSyntax
]
, bgroup "isPatternWhitespace"
[ benchNF "unicode-data" I.isPatternWhitespace
[ benchChars "unicode-data" I.isPatternWhitespace
]
]
, bgroup "Unicode.Char.Normalization"
[ bgroup "isCombining"
[ benchNF "unicode-data" N.isCombining
[ benchChars "unicode-data" N.isCombining
]
, bgroup "combiningClass"
[ benchNF "unicode-data" N.combiningClass
[ benchChars "unicode-data" N.combiningClass
]
, bgroup "isCombiningStarter"
[ benchNF "unicode-data" N.isCombiningStarter
[ benchChars "unicode-data" N.isCombiningStarter
]
-- [TODO] compose, composeStarters
, bgroup "isDecomposable"
[ bgroup "Canonical"
[ benchNF "unicode-data" (N.isDecomposable N.Canonical)
[ benchChars "unicode-data" (N.isDecomposable N.Canonical)
]
, bgroup "Kompat"
[ benchNF "unicode-data" (N.isDecomposable N.Kompat)
[ benchChars "unicode-data" (N.isDecomposable N.Kompat)
]
]
-- [FIXME] Fail due to non-exhaustive pattern matching
-- , bgroup "decompose"
-- [ bgroup "Canonical"
-- [ benchNF "unicode-data" (N.decompose N.Canonical)
-- [ benchChars "unicode-data" (N.decompose N.Canonical)
-- ]
-- , bgroup "Kompat"
-- [ benchNF "unicode-data" (N.decompose N.Kompat)
-- [ benchChars "unicode-data" (N.decompose N.Kompat)
-- ]
-- ]
, bgroup "decomposeHangul"
[ benchNF "unicode-data" N.decomposeHangul
[ benchChars "unicode-data" N.decomposeHangul
]
]
, bgroup "Unicode.Char.Numeric"
-- [TODO] Replace with 'isNumber' once the migration is done.
[ bgroup "isNumeric"
[ benchNF "unicode-data" Num.isNumeric
[ benchChars "unicode-data" Num.isNumeric
]
, bgroup "numericValue"
[ benchNF "unicode-data" Num.numericValue
[ benchChars "unicode-data" Num.numericValue
]
, bgroup "integerValue"
[ benchNF "unicode-data" Num.integerValue
[ benchChars "unicode-data" Num.integerValue
]
]
, bgroup "Unicode.Char.Numeric.Compat"
Expand All @@ -220,26 +223,40 @@ main = defaultMain
]
where
bgroup' groupTitle bs = bgroup groupTitle
[ benchNF' groupTitle title f
[ benchChars' groupTitle title f
| Bench title f <- bs
]

-- [NOTE] Works if groupTitle uniquely identifies the benchmark group.
benchNF' groupTitle title = case title of
"base" -> benchNF title
benchChars' groupTitle title = case title of
"base" -> benchChars title
_ -> bcompare ("$NF == \"base\" && $(NF-1) == \"" ++ groupTitle ++ "\"")
. benchNF title
. benchChars title

benchNF :: forall a. (NFData a) => String -> (Char -> a) -> Benchmark
benchNF t f =
benchChars :: forall a. (NFData a) => String -> (Char -> a) -> Benchmark
benchChars t f =
-- Avoid side-effects with garbage collection (see tasty-bench doc)
env
(evaluate (force chars)) -- initialize
(bench t . nf (fold_ f)) -- benchmark
(bench t . nf (foldString f)) -- benchmark
where
-- Filter out: Surrogates, Private Use Areas and unsassigned code points
chars = filter isValid [minBound..maxBound]
isValid c = G.generalCategory c < G.Surrogate

fold_ :: forall a. (NFData a) => (Char -> a) -> String -> ()
fold_ f = foldr (deepseq . f) ()
foldString :: forall a. (NFData a) => (Char -> a) -> String -> ()
foldString f = foldr (deepseq . f) ()

benchNF
:: forall a b. (Bounded a, Ix a, NFData b)
=> String
-> (a -> b)
-> Benchmark
benchNF t f = bench t (nf (fold_ f) (minBound, maxBound))

fold_
:: forall a b. (Ix a, NFData b)
=> (a -> b)
-> (a, a)
-> ()
fold_ f = foldr (deepseq . f) () . range
Loading