From 5eea2bcc0c03b962df9c7b5c1d1abf738927475e Mon Sep 17 00:00:00 2001 From: Pierre Le Marre Date: Mon, 19 Sep 2022 12:12:36 +0200 Subject: [PATCH 1/2] Remove inBlock and add blockDefinition benchmark --- unicode-data/bench/Main.hs | 103 ++++++++++-------- .../lib/Unicode/Char/General/Blocks.hs | 7 -- unicode-data/test/Unicode/CharSpec.hs | 4 +- 3 files changed, 62 insertions(+), 52 deletions(-) diff --git a/unicode-data/bench/Main.hs b/unicode-data/bench/Main.hs index 441567a7..ef9030dd 100644 --- a/unicode-data/bench/Main.hs +++ b/unicode-data/bench/Main.hs @@ -1,5 +1,6 @@ import Control.DeepSeq (NFData, deepseq, force) import Control.Exception (evaluate) +import Data.Ix (Ix(..)) import Test.Tasty.Bench (Benchmark, bgroup, bench, bcompare, defaultMain, env, nf) @@ -24,22 +25,22 @@ main :: IO () main = defaultMain [ bgroup "Unicode.Char.Case" [ bgroup "isLowerCase" - [ benchNF "unicode-data" C.isLowerCase + [ benchChars "unicode-data" C.isLowerCase ] , bgroup "isUpperCase" - [ benchNF "unicode-data" C.isUpperCase + [ benchChars "unicode-data" C.isUpperCase ] , bgroup "toCaseFoldString" - [ benchNF "unicode-data" C.toCaseFoldString + [ benchChars "unicode-data" C.toCaseFoldString ] , bgroup "toLowerString" - [ benchNF "unicode-data" C.toLowerString + [ benchChars "unicode-data" C.toLowerString ] , bgroup "toTitleString" - [ benchNF "unicode-data" C.toTitleString + [ benchChars "unicode-data" C.toTitleString ] , bgroup "toUpperString" - [ benchNF "unicode-data" C.toUpperString + [ benchChars "unicode-data" C.toUpperString ] ] , bgroup "Unicode.Char.Case.Compat" @@ -71,7 +72,7 @@ main = defaultMain , Bench "unicode-data" (show . G.generalCategory) ] , bgroup "isAlphabetic" - [ benchNF "unicode-data" G.isAlphabetic + [ benchChars "unicode-data" G.isAlphabetic ] , bgroup' "isAlphaNum" [ Bench "base" Char.isAlphaNum @@ -102,35 +103,37 @@ main = defaultMain , Bench "unicode-data" G.isSymbol ] , bgroup "isWhiteSpace" - [ benchNF "unicode-data" G.isWhiteSpace + [ benchChars "unicode-data" G.isWhiteSpace ] -- Korean Hangul Characters , bgroup "isHangul" - [ benchNF "unicode-data" G.isHangul + [ benchChars "unicode-data" G.isHangul ] , bgroup "isHangulLV" - [ benchNF "unicode-data" G.isHangul + [ benchChars "unicode-data" G.isHangul ] , bgroup "isJamo" - [ benchNF "unicode-data" G.isJamo + [ benchChars "unicode-data" G.isJamo ] , bgroup "jamoLIndex" - [ benchNF "unicode-data" G.jamoLIndex + [ benchChars "unicode-data" G.jamoLIndex ] , bgroup "jamoVIndex" - [ benchNF "unicode-data" G.jamoVIndex + [ benchChars "unicode-data" G.jamoVIndex ] , bgroup "jamoTIndex" - [ benchNF "unicode-data" G.jamoTIndex + [ benchChars "unicode-data" G.jamoTIndex ] ] , bgroup "Unicode.Char.General.Blocks" [ bgroup "block" - [ benchNF "unicode-data" (show . B.block) + [ benchChars "unicode-data" (show . B.block) + ] + , bgroup "blockDefinition" + [ benchNF "unicode-data" (show . B.blockDefinition) ] - -- [TODO] blockDefinition, inBlock , bgroup "allBlockRanges" - [ benchNF "unicode-data" (const B.allBlockRanges) + [ benchChars "unicode-data" (const B.allBlockRanges) ] ] , bgroup "Unicode.Char.General.Compat" @@ -149,66 +152,66 @@ main = defaultMain ] , bgroup "Unicode.Char.Identifiers" [ bgroup "isIDContinue" - [ benchNF "unicode-data" I.isIDContinue + [ benchChars "unicode-data" I.isIDContinue ] , bgroup "isIDStart" - [ benchNF "unicode-data" I.isIDStart + [ benchChars "unicode-data" I.isIDStart ] , bgroup "isXIDContinue" - [ benchNF "unicode-data" I.isXIDContinue + [ benchChars "unicode-data" I.isXIDContinue ] , bgroup "isXIDStart" - [ benchNF "unicode-data" I.isXIDStart + [ benchChars "unicode-data" I.isXIDStart ] , bgroup "isPatternSyntax" - [ benchNF "unicode-data" I.isPatternSyntax + [ benchChars "unicode-data" I.isPatternSyntax ] , bgroup "isPatternWhitespace" - [ benchNF "unicode-data" I.isPatternWhitespace + [ benchChars "unicode-data" I.isPatternWhitespace ] ] , bgroup "Unicode.Char.Normalization" [ bgroup "isCombining" - [ benchNF "unicode-data" N.isCombining + [ benchChars "unicode-data" N.isCombining ] , bgroup "combiningClass" - [ benchNF "unicode-data" N.combiningClass + [ benchChars "unicode-data" N.combiningClass ] , bgroup "isCombiningStarter" - [ benchNF "unicode-data" N.isCombiningStarter + [ benchChars "unicode-data" N.isCombiningStarter ] -- [TODO] compose, composeStarters , bgroup "isDecomposable" [ bgroup "Canonical" - [ benchNF "unicode-data" (N.isDecomposable N.Canonical) + [ benchChars "unicode-data" (N.isDecomposable N.Canonical) ] , bgroup "Kompat" - [ benchNF "unicode-data" (N.isDecomposable N.Kompat) + [ benchChars "unicode-data" (N.isDecomposable N.Kompat) ] ] -- [FIXME] Fail due to non-exhaustive pattern matching -- , bgroup "decompose" -- [ bgroup "Canonical" - -- [ benchNF "unicode-data" (N.decompose N.Canonical) + -- [ benchChars "unicode-data" (N.decompose N.Canonical) -- ] -- , bgroup "Kompat" - -- [ benchNF "unicode-data" (N.decompose N.Kompat) + -- [ benchChars "unicode-data" (N.decompose N.Kompat) -- ] -- ] , bgroup "decomposeHangul" - [ benchNF "unicode-data" N.decomposeHangul + [ benchChars "unicode-data" N.decomposeHangul ] ] , bgroup "Unicode.Char.Numeric" -- [TODO] Replace with 'isNumber' once the migration is done. [ bgroup "isNumeric" - [ benchNF "unicode-data" Num.isNumeric + [ benchChars "unicode-data" Num.isNumeric ] , bgroup "numericValue" - [ benchNF "unicode-data" Num.numericValue + [ benchChars "unicode-data" Num.numericValue ] , bgroup "integerValue" - [ benchNF "unicode-data" Num.integerValue + [ benchChars "unicode-data" Num.integerValue ] ] , bgroup "Unicode.Char.Numeric.Compat" @@ -220,26 +223,40 @@ main = defaultMain ] where bgroup' groupTitle bs = bgroup groupTitle - [ benchNF' groupTitle title f + [ benchChars' groupTitle title f | Bench title f <- bs ] -- [NOTE] Works if groupTitle uniquely identifies the benchmark group. - benchNF' groupTitle title = case title of - "base" -> benchNF title + benchChars' groupTitle title = case title of + "base" -> benchChars title _ -> bcompare ("$NF == \"base\" && $(NF-1) == \"" ++ groupTitle ++ "\"") - . benchNF title + . benchChars title - benchNF :: forall a. (NFData a) => String -> (Char -> a) -> Benchmark - benchNF t f = + benchChars :: forall a. (NFData a) => String -> (Char -> a) -> Benchmark + benchChars t f = -- Avoid side-effects with garbage collection (see tasty-bench doc) env (evaluate (force chars)) -- initialize - (bench t . nf (fold_ f)) -- benchmark + (bench t . nf (foldString f)) -- benchmark where -- Filter out: Surrogates, Private Use Areas and unsassigned code points chars = filter isValid [minBound..maxBound] isValid c = G.generalCategory c < G.Surrogate - fold_ :: forall a. (NFData a) => (Char -> a) -> String -> () - fold_ f = foldr (deepseq . f) () + foldString :: forall a. (NFData a) => (Char -> a) -> String -> () + foldString f = foldr (deepseq . f) () + + benchNF + :: forall a b. (Bounded a, Ix a, NFData b) + => String + -> (a -> b) + -> Benchmark + benchNF t f = bench t (nf (fold_ f) (minBound, maxBound)) + + fold_ + :: forall a b. (Ix a, NFData b) + => (a -> b) + -> (a, a) + -> () + fold_ f = foldr (deepseq . f) () . range diff --git a/unicode-data/lib/Unicode/Char/General/Blocks.hs b/unicode-data/lib/Unicode/Char/General/Blocks.hs index 262adcd2..d45ec356 100644 --- a/unicode-data/lib/Unicode/Char/General/Blocks.hs +++ b/unicode-data/lib/Unicode/Char/General/Blocks.hs @@ -14,7 +14,6 @@ module Unicode.Char.General.Blocks , B.BlockDefinition(..) , block , B.blockDefinition - , inBlock , allBlockRanges , allBlockRanges' ) @@ -31,12 +30,6 @@ import qualified Unicode.Internal.Char.Blocks as B block :: Char -> Maybe B.Block block = fmap toEnum . B.block --- [TODO] @since --- | Check if a character is in a block. -{-# INLINE inBlock #-} -inBlock :: B.Block -> Char -> Bool -inBlock b = (== Just b) . block - -- [TODO] @since -- | All the block ranges, in ascending order. {-# INLINE allBlockRanges #-} diff --git a/unicode-data/test/Unicode/CharSpec.hs b/unicode-data/test/Unicode/CharSpec.hs index b38db486..92c46f34 100644 --- a/unicode-data/test/Unicode/CharSpec.hs +++ b/unicode-data/test/Unicode/CharSpec.hs @@ -53,13 +53,13 @@ spec = do Just _ -> pure () Nothing -> UChar.generalCategory c `shouldBe` UChar.NotAssigned } in traverse_ check [minBound..maxBound] - it "Characters are in the definition of their corresponding block + inBlock" + it "Characters are in the definition of their corresponding block" let { check c = case UBlocks.block c of Nothing -> pure () Just b -> let r = UBlocks.blockRange (UBlocks.blockDefinition b) - in if inRange r (UChar.ord c) && UBlocks.inBlock b c + in if inRange r (UChar.ord c) then pure () else expectationFailure $ mconcat [ "Character “", show c From c591d867e3a3445947c7bb57b7b48765d9ebe33a Mon Sep 17 00:00:00 2001 From: Pierre Le Marre Date: Mon, 19 Sep 2022 12:14:09 +0200 Subject: [PATCH 2/2] Remove inScript and add scriptDefinition benchmark. --- unicode-data-scripts/bench/Main.hs | 31 +++++++-- .../lib/Unicode/Char/General/Scripts.hs | 7 -- .../test/Unicode/Char/General/ScriptsSpec.hs | 66 ++++++++++++++++--- 3 files changed, 82 insertions(+), 22 deletions(-) diff --git a/unicode-data-scripts/bench/Main.hs b/unicode-data-scripts/bench/Main.hs index 4700b7ce..cc653dad 100644 --- a/unicode-data-scripts/bench/Main.hs +++ b/unicode-data-scripts/bench/Main.hs @@ -1,5 +1,6 @@ import Control.DeepSeq (NFData, deepseq, force) import Control.Exception (evaluate) +import Data.Ix (Ix(..)) import Test.Tasty.Bench (Benchmark, bgroup, bench, defaultMain, env, nf) @@ -10,22 +11,38 @@ main :: IO () main = defaultMain [ bgroup "Unicode.Char.General.Script" [ bgroup "script" - [ benchNF "unicode-data" (show . S.script) + [ benchChars "unicode-data" (show . S.script) + ] + , bgroup "scriptDefinition" + [ benchNF "unicode-data" (show . S.scriptDefinition) ] - -- [TODO] scriptDefinition, inScript ] ] where - benchNF :: forall a. (NFData a) => String -> (Char -> a) -> Benchmark - benchNF t f = + benchChars :: forall a. (NFData a) => String -> (Char -> a) -> Benchmark + benchChars t f = -- Avoid side-effects with garbage collection (see tasty-bench doc) env (evaluate (force chars)) -- initialize - (bench t . nf (fold_ f)) -- benchmark + (bench t . nf (foldString f)) -- benchmark where -- Filter out: Surrogates, Private Use Areas and unsassigned code points chars = filter isValid [minBound..maxBound] isValid c = G.generalCategory c < G.Surrogate - fold_ :: forall a. (NFData a) => (Char -> a) -> String -> () - fold_ f = foldr (deepseq . f) () + foldString :: forall a. (NFData a) => (Char -> a) -> String -> () + foldString f = foldr (deepseq . f) () + + benchNF + :: forall a b. (Bounded a, Ix a, NFData b) + => String + -> (a -> b) + -> Benchmark + benchNF t f = bench t (nf (fold_ f) (minBound, maxBound)) + + fold_ + :: forall a b. (Ix a, NFData b) + => (a -> b) + -> (a, a) + -> () + fold_ f = foldr (deepseq . f) () . range diff --git a/unicode-data-scripts/lib/Unicode/Char/General/Scripts.hs b/unicode-data-scripts/lib/Unicode/Char/General/Scripts.hs index a4bbabab..a4f9b355 100644 --- a/unicode-data-scripts/lib/Unicode/Char/General/Scripts.hs +++ b/unicode-data-scripts/lib/Unicode/Char/General/Scripts.hs @@ -15,7 +15,6 @@ module Unicode.Char.General.Scripts ( S.Script(..) , script , scriptDefinition - , inScript ) where @@ -39,12 +38,6 @@ import qualified Unicode.Internal.Char.Scripts as S script :: Char -> S.Script script = toEnum . S.script --- [TODO] @since --- | Check if a character is in a 'S.Script'. -{-# INLINE inScript #-} -inScript :: S.Script -> Char -> Bool -inScript s = (== s) . script - {- HLINT ignore scriptDefinition "Eta reduce" -} -- [TODO] @since -- | Characters correspinding to a 'S.Script'. diff --git a/unicode-data-scripts/test/Unicode/Char/General/ScriptsSpec.hs b/unicode-data-scripts/test/Unicode/Char/General/ScriptsSpec.hs index efd3ff4e..f3b68191 100644 --- a/unicode-data-scripts/test/Unicode/Char/General/ScriptsSpec.hs +++ b/unicode-data-scripts/test/Unicode/Char/General/ScriptsSpec.hs @@ -1,12 +1,26 @@ -{-# LANGUAGE BlockArguments #-} +{-# LANGUAGE BlockArguments, CPP #-} module Unicode.Char.General.ScriptsSpec ( spec ) where -import qualified Unicode.Char.General.Scripts as UScripts import Data.Foldable (traverse_) import Test.Hspec +import qualified Unicode.Char.General.Scripts as UScripts +import qualified Unicode.Internal.Char.Scripts as S + +import GHC.Exts + (Ptr(..), Char(..), Int(..), + indexWord32OffAddr#, int2Word#, + and#, isTrue#, eqWord#, leWord#, neWord#, + andI#, (-#), (<#), + ord#) +#if MIN_VERSION_base(4,16,0) +import GHC.Exts (word32ToWord#) +#endif +#ifdef WORDS_BIGENDIAN +import GHC.Exts (byteSwap32#) +#endif {- [NOTE] These tests may fail if the compiler’s Unicode version @@ -26,14 +40,11 @@ does not match the version of this package. spec :: Spec spec = do describe "Unicode scripts" do - it "inScript" - let check s = if all (UScripts.inScript s) (UScripts.scriptDefinition s) - then pure () - else expectationFailure (show s) - in traverse_ check [minBound..maxBound] it "Characters are in the definition of their corresponding script" let { - check c = let s = UScripts.script c in if UScripts.inScript s c + check c = + let s = UScripts.script c + in if s `inScript` c then pure () else expectationFailure $ mconcat [ "Char “", show c, "” in not in the definition of “" @@ -49,3 +60,42 @@ spec = do check s = let chars = UScripts.scriptDefinition s in traverse_ (checkChar s) chars } in traverse_ check [minBound..maxBound] + +{- HLINT ignore inScript "Eta reduce" -} +-- Check if a character is in a 'S.Script'. +-- This is faster than testing the string from UScripts.scriptDefinition +inScript :: S.Script -> Char -> Bool +inScript s (C# c#) = check (S.scriptDefinition s) + where + -- [NOTE] see 'scriptDefinition' for the description of the encoding. + + scriptRangeMask# = 0x80000000## -- 1 << 31 + maskComplement# = 0x7fffffff## -- 1 << 31 ^ 0xffffffff + cp# = int2Word# (ord# c#) + + check (Ptr addr#, I# n#) = let { + getRawCodePoint k# = +#ifdef WORDS_BIGENDIAN +#if MIN_VERSION_base(4,16,0) + byteSwap32# (word32ToWord# (indexWord32OffAddr# addr# k#)); +#else + byteSwap32# (indexWord32OffAddr# addr# k#); +#endif +#elif MIN_VERSION_base(4,16,0) + word32ToWord# (indexWord32OffAddr# addr# k#); +#else + indexWord32OffAddr# addr# k#; +#endif + getCodePoint k# = and# maskComplement# k#; + find k# = not (isTrue# (k# <# 0#)) && + let { + r1# = getRawCodePoint k#; + c1# = getCodePoint r1#; + isRange = isTrue# (and# r1# scriptRangeMask# `neWord#` 0##) + } in if isRange + then let { + c2# = getCodePoint (getRawCodePoint (k# -# 1#)); + found = isTrue# ((c2# `leWord#` cp#) `andI#` (cp# `leWord#` c1#)) + } in found || find (k# -# 2#) + else isTrue# (c1# `eqWord#` cp#) || find (k# -# 1#) + } in find (n# -# 1#)