composewell · wismill · Sep 20, 2022 · Sep 19, 2022 · Sep 19, 2022
diff --git a/unicode-data-scripts/bench/Main.hs b/unicode-data-scripts/bench/Main.hs
@@ -1,5 +1,6 @@
 import Control.DeepSeq (NFData, deepseq, force)
 import Control.Exception (evaluate)
+import Data.Ix (Ix(..))
 import Test.Tasty.Bench
     (Benchmark, bgroup, bench, defaultMain, env, nf)
 
@@ -10,22 +11,38 @@ main :: IO ()
 main = defaultMain
   [ bgroup "Unicode.Char.General.Script"
     [ bgroup "script"
-      [ benchNF "unicode-data"  (show . S.script)
+      [ benchChars "unicode-data"  (show . S.script)
+      ]
+    , bgroup "scriptDefinition"
+      [ benchNF "unicode-data"  (show . S.scriptDefinition)
       ]
-    -- [TODO] scriptDefinition, inScript
     ]
   ]
   where
-    benchNF :: forall a. (NFData a) => String -> (Char -> a) -> Benchmark
-    benchNF t f =
+    benchChars :: forall a. (NFData a) => String -> (Char -> a) -> Benchmark
+    benchChars t f =
         -- Avoid side-effects with garbage collection (see tasty-bench doc)
         env
             (evaluate (force chars)) -- initialize
-            (bench t . nf (fold_ f)) -- benchmark
+            (bench t . nf (foldString f)) -- benchmark
         where
         -- Filter out: Surrogates, Private Use Areas and unsassigned code points
         chars = filter isValid [minBound..maxBound]
         isValid c = G.generalCategory c < G.Surrogate
 
-    fold_ :: forall a. (NFData a) => (Char -> a) -> String -> ()
-    fold_ f = foldr (deepseq . f) ()
+    foldString :: forall a. (NFData a) => (Char -> a) -> String -> ()
+    foldString f = foldr (deepseq . f) ()
+
+    benchNF
+        :: forall a b. (Bounded a, Ix a, NFData b)
+        => String
+        -> (a -> b)
+        -> Benchmark
+    benchNF t f = bench t (nf (fold_ f) (minBound, maxBound))
+
+    fold_
+        :: forall a b. (Ix a, NFData b)
+        => (a -> b)
+        -> (a, a)
+        -> ()
+    fold_ f = foldr (deepseq . f) () . range
diff --git a/unicode-data-scripts/lib/Unicode/Char/General/Scripts.hs b/unicode-data-scripts/lib/Unicode/Char/General/Scripts.hs
@@ -15,7 +15,6 @@ module Unicode.Char.General.Scripts
     ( S.Script(..)
     , script
     , scriptDefinition
-    , inScript
     )
 where
 
@@ -39,12 +38,6 @@ import qualified Unicode.Internal.Char.Scripts as S
 script :: Char -> S.Script
 script = toEnum . S.script
 
--- [TODO] @since
--- | Check if a character is in a 'S.Script'.
-{-# INLINE inScript #-}
-inScript :: S.Script -> Char -> Bool
-inScript s = (== s) . script
-
 {- HLINT ignore scriptDefinition "Eta reduce" -}
 -- [TODO] @since
 -- | Characters correspinding to a 'S.Script'.

diff --git a/unicode-data-scripts/test/Unicode/Char/General/ScriptsSpec.hs b/unicode-data-scripts/test/Unicode/Char/General/ScriptsSpec.hs
@@ -1,12 +1,26 @@
-{-# LANGUAGE BlockArguments #-}
+{-# LANGUAGE BlockArguments, CPP #-}
 
 module Unicode.Char.General.ScriptsSpec
   ( spec
   ) where
 
-import qualified Unicode.Char.General.Scripts as UScripts
 import Data.Foldable (traverse_)
 import Test.Hspec
+import qualified Unicode.Char.General.Scripts as UScripts
+import qualified Unicode.Internal.Char.Scripts as S
+
+import GHC.Exts
+       (Ptr(..), Char(..), Int(..),
+        indexWord32OffAddr#, int2Word#,
+        and#, isTrue#, eqWord#, leWord#, neWord#,
+        andI#, (-#), (<#),
+        ord#)
+#if MIN_VERSION_base(4,16,0)
+import GHC.Exts (word32ToWord#)
+#endif
+#ifdef WORDS_BIGENDIAN
+import GHC.Exts (byteSwap32#)
+#endif
 
 {- [NOTE]
 These tests may fail if the compiler’s Unicode version
@@ -26,14 +40,11 @@ does not match the version of this package.
 spec :: Spec
 spec = do
   describe "Unicode scripts" do
-    it "inScript"
-        let check s = if all (UScripts.inScript s) (UScripts.scriptDefinition s)
-                    then pure ()
-                    else expectationFailure (show s)
-        in traverse_ check [minBound..maxBound]
     it "Characters are in the definition of their corresponding script"
         let {
-            check c = let s = UScripts.script c in if UScripts.inScript s c
+            check c =
+                let s = UScripts.script c
+                in if s `inScript` c
                     then pure ()
                     else expectationFailure $ mconcat
                         [ "Char “", show c, "” in not in the definition of “"
@@ -49,3 +60,42 @@ spec = do
             check s = let chars = UScripts.scriptDefinition s
                       in traverse_ (checkChar s) chars
         } in traverse_ check [minBound..maxBound]
+
+{- HLINT ignore inScript "Eta reduce" -}
+-- Check if a character is in a 'S.Script'.
+-- This is faster than testing the string from UScripts.scriptDefinition
+inScript :: S.Script -> Char -> Bool
+inScript s (C# c#) = check (S.scriptDefinition s)
+    where
+    -- [NOTE] see 'scriptDefinition' for the description of the encoding.
+
+    scriptRangeMask# = 0x80000000## -- 1 << 31
+    maskComplement#  = 0x7fffffff## -- 1 << 31 ^ 0xffffffff
+    cp# = int2Word# (ord# c#)
+
+    check (Ptr addr#, I# n#) = let {
+        getRawCodePoint k# =
+#ifdef WORDS_BIGENDIAN
+#if MIN_VERSION_base(4,16,0)
+            byteSwap32# (word32ToWord# (indexWord32OffAddr# addr# k#));
+#else
+            byteSwap32# (indexWord32OffAddr# addr# k#);
+#endif
+#elif MIN_VERSION_base(4,16,0)
+            word32ToWord# (indexWord32OffAddr# addr# k#);
+#else
+            indexWord32OffAddr# addr# k#;
+#endif
+        getCodePoint k# = and# maskComplement# k#;
+        find k# = not (isTrue# (k# <# 0#)) &&
+            let {
+                r1# = getRawCodePoint k#;
+                c1# = getCodePoint r1#;
+                isRange = isTrue# (and# r1# scriptRangeMask# `neWord#` 0##)
+            } in if isRange
+                then let {
+                    c2# = getCodePoint (getRawCodePoint (k# -# 1#));
+                    found = isTrue# ((c2# `leWord#` cp#) `andI#` (cp# `leWord#` c1#))
+                } in found || find (k# -# 2#)
+                else isTrue# (c1# `eqWord#` cp#) || find (k# -# 1#)
+    } in find (n# -# 1#)
diff --git a/unicode-data/bench/Main.hs b/unicode-data/bench/Main.hs
@@ -1,5 +1,6 @@
 import Control.DeepSeq (NFData, deepseq, force)
 import Control.Exception (evaluate)
+import Data.Ix (Ix(..))
 import Test.Tasty.Bench
     (Benchmark, bgroup, bench, bcompare, defaultMain, env, nf)
 
@@ -24,22 +25,22 @@ main :: IO ()
 main = defaultMain
   [ bgroup "Unicode.Char.Case"
     [ bgroup "isLowerCase"
-      [ benchNF "unicode-data" C.isLowerCase
+      [ benchChars "unicode-data" C.isLowerCase
       ]
     , bgroup "isUpperCase"
-      [ benchNF "unicode-data" C.isUpperCase
+      [ benchChars "unicode-data" C.isUpperCase
       ]
     , bgroup "toCaseFoldString"
-      [ benchNF "unicode-data" C.toCaseFoldString
+      [ benchChars "unicode-data" C.toCaseFoldString
       ]
     , bgroup "toLowerString"
-      [ benchNF "unicode-data" C.toLowerString
+      [ benchChars "unicode-data" C.toLowerString
       ]
     , bgroup "toTitleString"
-      [ benchNF "unicode-data" C.toTitleString
+      [ benchChars "unicode-data" C.toTitleString
       ]
     , bgroup "toUpperString"
-      [ benchNF "unicode-data" C.toUpperString
+      [ benchChars "unicode-data" C.toUpperString
       ]
     ]
   , bgroup "Unicode.Char.Case.Compat"
@@ -71,7 +72,7 @@ main = defaultMain
       , Bench "unicode-data"  (show . G.generalCategory)
       ]
     , bgroup "isAlphabetic"
-      [ benchNF "unicode-data"  G.isAlphabetic
+      [ benchChars "unicode-data"  G.isAlphabetic
       ]
     , bgroup' "isAlphaNum"
       [ Bench "base"          Char.isAlphaNum
@@ -102,35 +103,37 @@ main = defaultMain
       , Bench "unicode-data"  G.isSymbol
       ]
     , bgroup "isWhiteSpace"
-      [ benchNF "unicode-data"  G.isWhiteSpace
+      [ benchChars "unicode-data"  G.isWhiteSpace
       ]
     -- Korean Hangul Characters
     , bgroup "isHangul"
-      [ benchNF "unicode-data"  G.isHangul
+      [ benchChars "unicode-data"  G.isHangul
       ]
     , bgroup "isHangulLV"
-      [ benchNF "unicode-data"  G.isHangul
+      [ benchChars "unicode-data"  G.isHangul
       ]
     , bgroup "isJamo"
-      [ benchNF "unicode-data"  G.isJamo
+      [ benchChars "unicode-data"  G.isJamo
       ]
     , bgroup "jamoLIndex"
-      [ benchNF "unicode-data"  G.jamoLIndex
+      [ benchChars "unicode-data"  G.jamoLIndex
       ]
     , bgroup "jamoVIndex"
-      [ benchNF "unicode-data"  G.jamoVIndex
+      [ benchChars "unicode-data"  G.jamoVIndex
       ]
     , bgroup "jamoTIndex"
-      [ benchNF "unicode-data"  G.jamoTIndex
+      [ benchChars "unicode-data"  G.jamoTIndex
       ]
     ]
   , bgroup "Unicode.Char.General.Blocks"
     [ bgroup "block"
-      [ benchNF "unicode-data"  (show . B.block)
+      [ benchChars "unicode-data"  (show . B.block)
+      ]
+    , bgroup "blockDefinition"
+      [ benchNF "unicode-data"  (show . B.blockDefinition)
       ]
-    -- [TODO] blockDefinition, inBlock
     , bgroup "allBlockRanges"
-      [ benchNF "unicode-data"  (const B.allBlockRanges)
+      [ benchChars "unicode-data"  (const B.allBlockRanges)
       ]
     ]
   , bgroup "Unicode.Char.General.Compat"
@@ -149,66 +152,66 @@ main = defaultMain
     ]
   , bgroup "Unicode.Char.Identifiers"
     [ bgroup "isIDContinue"
-      [ benchNF "unicode-data"  I.isIDContinue
+      [ benchChars "unicode-data"  I.isIDContinue
       ]
     , bgroup "isIDStart"
-      [ benchNF "unicode-data"  I.isIDStart
+      [ benchChars "unicode-data"  I.isIDStart
       ]
     , bgroup "isXIDContinue"
-      [ benchNF "unicode-data"  I.isXIDContinue
+      [ benchChars "unicode-data"  I.isXIDContinue
       ]
     , bgroup "isXIDStart"
-      [ benchNF "unicode-data"  I.isXIDStart
+      [ benchChars "unicode-data"  I.isXIDStart
       ]
     , bgroup "isPatternSyntax"
-      [ benchNF "unicode-data"  I.isPatternSyntax
+      [ benchChars "unicode-data"  I.isPatternSyntax
       ]
     , bgroup "isPatternWhitespace"
-      [ benchNF "unicode-data"  I.isPatternWhitespace
+      [ benchChars "unicode-data"  I.isPatternWhitespace
       ]
     ]
   , bgroup "Unicode.Char.Normalization"
     [ bgroup "isCombining"
-      [ benchNF "unicode-data"  N.isCombining
+      [ benchChars "unicode-data"  N.isCombining
       ]
     , bgroup "combiningClass"
-      [ benchNF "unicode-data"  N.combiningClass
+      [ benchChars "unicode-data"  N.combiningClass
       ]
     , bgroup "isCombiningStarter"
-      [ benchNF "unicode-data"  N.isCombiningStarter
+      [ benchChars "unicode-data"  N.isCombiningStarter
       ]
     -- [TODO] compose, composeStarters
     , bgroup "isDecomposable"
       [ bgroup "Canonical"
-        [ benchNF "unicode-data" (N.isDecomposable N.Canonical)
+        [ benchChars "unicode-data" (N.isDecomposable N.Canonical)
         ]
       , bgroup "Kompat"
-        [ benchNF "unicode-data" (N.isDecomposable N.Kompat)
+        [ benchChars "unicode-data" (N.isDecomposable N.Kompat)
         ]
       ]
     -- [FIXME] Fail due to non-exhaustive pattern matching
     -- , bgroup "decompose"
     --   [ bgroup "Canonical"
-    --     [ benchNF "unicode-data" (N.decompose N.Canonical)
+    --     [ benchChars "unicode-data" (N.decompose N.Canonical)
     --     ]
     --   , bgroup "Kompat"
-    --     [ benchNF "unicode-data" (N.decompose N.Kompat)
+    --     [ benchChars "unicode-data" (N.decompose N.Kompat)
     --     ]
     --   ]
     , bgroup "decomposeHangul"
-      [ benchNF "unicode-data" N.decomposeHangul
+      [ benchChars "unicode-data" N.decomposeHangul
       ]
     ]
   , bgroup "Unicode.Char.Numeric"
     -- [TODO] Replace with 'isNumber' once the migration is done.
     [ bgroup "isNumeric"
-      [ benchNF "unicode-data"  Num.isNumeric
+      [ benchChars "unicode-data"  Num.isNumeric
       ]
     , bgroup "numericValue"
-      [ benchNF "unicode-data" Num.numericValue
+      [ benchChars "unicode-data" Num.numericValue
       ]
     , bgroup "integerValue"
-      [ benchNF "unicode-data" Num.integerValue
+      [ benchChars "unicode-data" Num.integerValue
       ]
     ]
   , bgroup "Unicode.Char.Numeric.Compat"
@@ -220,26 +223,40 @@ main = defaultMain
   ]
   where
     bgroup' groupTitle bs = bgroup groupTitle
-      [ benchNF' groupTitle title f
+      [ benchChars' groupTitle title f
       | Bench title f <- bs
       ]
 
     -- [NOTE] Works if groupTitle uniquely identifies the benchmark group.
-    benchNF' groupTitle title = case title of
-      "base" -> benchNF title
+    benchChars' groupTitle title = case title of
+      "base" -> benchChars title
       _      -> bcompare ("$NF == \"base\" && $(NF-1) == \"" ++ groupTitle ++ "\"")
-              . benchNF title
+              . benchChars title
 
-    benchNF :: forall a. (NFData a) => String -> (Char -> a) -> Benchmark
-    benchNF t f =
+    benchChars :: forall a. (NFData a) => String -> (Char -> a) -> Benchmark
+    benchChars t f =
         -- Avoid side-effects with garbage collection (see tasty-bench doc)
         env
             (evaluate (force chars)) -- initialize
-            (bench t . nf (fold_ f)) -- benchmark
+            (bench t . nf (foldString f)) -- benchmark
         where
         -- Filter out: Surrogates, Private Use Areas and unsassigned code points
         chars = filter isValid [minBound..maxBound]
         isValid c = G.generalCategory c < G.Surrogate
 
-    fold_ :: forall a. (NFData a) => (Char -> a) -> String -> ()
-    fold_ f = foldr (deepseq . f) ()
+    foldString :: forall a. (NFData a) => (Char -> a) -> String -> ()
+    foldString f = foldr (deepseq . f) ()
+
+    benchNF
+        :: forall a b. (Bounded a, Ix a, NFData b)
+        => String
+        -> (a -> b)
+        -> Benchmark
+    benchNF t f = bench t (nf (fold_ f) (minBound, maxBound))
+
+    fold_
+        :: forall a b. (Ix a, NFData b)
+        => (a -> b)
+        -> (a, a)
+        -> ()
+    fold_ f = foldr (deepseq . f) () . range