Update to latest basic-cli

roc-lang · Dec 15, 2023 · 319a9a0 · 319a9a0
1 parent 6c98f6a
commit 319a9a0
Show file tree

Hide file tree

Showing 15 changed files with 6,126 additions and 6,002 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,6 +1,6 @@
 
 # Ignore the example binaries
-example
+examples/simple
 
 # Ignore the generated files
 generated-docs

diff --git a/README.md b/README.md
@@ -1,3 +1,17 @@
 # Work in progress!
 
-Someday this will be a useful collection of Unicode operations, but for right now it's extremely WIP.
+Someday this will be a useful collection of Unicode operations, but for right now it's extremely WIP.
+
+## Text Segmentation `Str -> List Str`
+
+### General Process
+1. Convert to `List U8`
+2. Process bytes into code points 
+3. Process code points into respective [GraphemeBreakProperty](https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt) e.g. U+000D -> CR, \u(11000) -> SpacingMark, \u(AC00) -> LV
+4. Apply the [Grapheme Boundary](https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules) rules e.g. [utf8proc implementation](https://github.com/JuliaStrings/utf8proc/blob/1cb28a66ca79a0845e99433fd1056257456cef8b/utf8proc.c#L261) or [this blog](https://halt.software/optimizing-unicodes-grapheme-cluster-break-algorithm/) ** note I like the running state version with a table lookup, might need a `List.getUnchecked` to be efficient?? 
+5. Run through code points to get break indexes, map using `List.subList`
+
+## Other Ideas
+- This [online unicode tool](https://util.unicode.org/UnicodeJsps/breaks.jsp) looks helpful for debugging
+- Write a script to parse the `.txt` files into useful `.roc` files... e.g. the [Tests](https://www.unicode.org/reports/tr41/tr41-32.html#Tests29) could be auto-generated to test our implementation of boundaries. Note: in the test notation, `÷` means Break and `×` means Don't Break
+- Can apply similar process for Words and Sentence boundaries
diff --git a/examples/simple.roc b/examples/simple.roc
@@ -1,6 +1,6 @@
 app "example"
     packages {
-        cli: "https://github.com/roc-lang/basic-cli/releases/download/0.5.0/Cufzl36_SnJ4QbOoEmiJ5dIpUxBvdB3NEySvuH82Wio.tar.br",
+        cli: "https://github.com/roc-lang/basic-cli/releases/download/0.7.0/bkGby8jb0tmZYsy2hg1E_B2QrCgcSTxdUlHtETwm5m4.tar.br",
         unicode: "../package/main.roc", # use release URL (ends in tar.br) for local example, see github.com/roc/unicode/releases
     }
     imports [

diff --git a/package/CodePoint.roc b/package/CodePoint.roc
@@ -266,7 +266,7 @@ parseUtf8Help = \rest, cps ->
     else
         parsePartialUtf8 rest
         |> Result.try \{ codePoint, bytesParsed } ->
-            parseUtf8Help (List.drop rest bytesParsed) (List.append cps codePoint)
+            parseUtf8Help (List.dropFirst rest bytesParsed) (List.append cps codePoint)
 
 # test simple ASCII "Hello"
 expect
@@ -358,7 +358,7 @@ cpsToStrHelp = \cps, bytes ->
         [] -> bytes
         [cp,..] -> 
             cpsToStrHelp 
-                (List.drop cps 1)
+                (List.dropFirst cps 1)
                 (CodePoint.appendUtf8 bytes cp)
 
 expect # test toStr 

diff --git a/package/Grapheme.roc b/package/Grapheme.roc
@@ -7,6 +7,7 @@ interface Grapheme
         CodePoint.{ CodePoint, Utf8ParseErr },
         InternalGBP.{ GBP },
         InternalCP,
+        InternalEmoji,
     ]
 
 ## Extended Grapheme Cluster
@@ -16,7 +17,7 @@ Grapheme : InternalGBP.GBP
 # capacity here for the number of substrings
 defaultCapacity = 50
 
-## Split a string into extended graphemes clusters
+## Split a string into extended grapheme clusters
 ## 
 ## This is typically associated with "characters" in a string, for example:
 ## TODO ADD EXAMPLES
@@ -64,7 +65,7 @@ splitHelp = \cpsWithGpbs, prevState, acc, strs ->
     # Set up helper to advance to the next CP recursively and update state 
     advance = \nextState, nextAcc, nextStrs -> 
         splitHelp
-            (List.drop cpsWithGpbs 1)
+            (List.dropFirst cpsWithGpbs 1)
             nextState
             nextAcc
             nextStrs
@@ -76,12 +77,17 @@ splitHelp = \cpsWithGpbs, prevState, acc, strs ->
         # Look ahead if we have more than one CodePoint left
         [current, next, ..] ->
             if isExtendOrZWJOrSpacing next then 
+
+                # GB11
+
                 # GB9 Do not break before extending characters, ZWJ, or spacing marks.
                 advance
                     DontBreak
                     (List.append acc (extractCP current))
                     strs
 
+
+
             else 
                 nextState = gbpRules prevState (extractGBP current)
 
@@ -151,7 +157,6 @@ gbpRules = \prevState, gbp ->
         (_, Prepend) -> DontBreak
         # GB1, GB2 Break at the start and end of text, unless the text is empty
         _ -> Break
-
 
 GCBPState : [
     DontBreak,
@@ -163,4 +168,77 @@ GCBPState : [
     AfterRI,
 ]
 
-
+# WIP add emoji handling to text segmentation: takeGb11 hands cps to takeGb11Help, starting in the Start state with an empty accumulator and all of cps as rest
+takeGb11 : List CodePoint -> Result { acc: List CodePoint, rest : List CodePoint } [NotEmojiSequence] 
+takeGb11 = \cps ->
+    takeGb11Help 
+        {
+            acc: List.withCapacity 10, # TODO is there a better default capacity?
+            state: Start, # initial Gb11State; nothing consumed yet
+            rest: cps, # code points still to be examined
+        }
+
+Gb11State : [Start, AfterPictographic, AfterExtend, AfterZWJ] # Start = nothing consumed yet; the After* tags name the kind of code point takeGb11Help consumed last
+
+# State machine for GB11 "Do not break within emoji modifier sequences or emoji zwj sequences": moves code points from the front of `rest` onto `acc` while the Pictographic / Extend / ZWJ guards below match; Ok is produced only by the unguarded AfterPictographic branch, everything else ends in Err NotEmojiSequence
+takeGb11Help : { acc: List CodePoint, state: Gb11State, rest : List CodePoint} -> Result { acc: List CodePoint, rest : List CodePoint } [NotEmojiSequence] 
+takeGb11Help = \{ acc, state, rest } ->
+    when rest is 
+        [] -> Err NotEmojiSequence # NOTE(review): running out of input is an error in every state, even directly after a Pictographic — confirm this is intended
+        [cp, ..] ->
+            when state is 
+                Start if InternalEmoji.isPictographic (CodePoint.toU32 cp) -> 
+                    {
+                        acc: List.append acc cp,
+                        state: AfterPictographic,
+                        rest: List.dropFirst rest 1,
+                    }
+                    |> takeGb11Help # next cp 
+                AfterPictographic if InternalGBP.isExtend (CodePoint.toU32 cp) -> 
+                    {
+                        acc: List.append acc cp,
+                        state: AfterExtend,
+                        rest: List.dropFirst rest 1,
+                    }
+                    |> takeGb11Help # next cp 
+                AfterPictographic if InternalGBP.isZWJ (CodePoint.toU32 cp) -> 
+                    {
+                        acc: List.append acc cp,
+                        state: AfterZWJ,
+                        rest: List.dropFirst rest 1,
+                    }
+                    |> takeGb11Help # next cp 
+                AfterPictographic -> # reached when cp is neither Extend nor ZWJ; NOTE(review): cp is still appended to acc and dropped from rest before returning Ok — confirm it should be consumed
+                    {
+                        acc: List.append acc cp,
+                        rest: List.dropFirst rest 1,
+                    }
+                    |> Ok
+                AfterExtend if InternalGBP.isExtend (CodePoint.toU32 cp) -> 
+                    {
+                        acc: List.append acc cp,
+                        state: AfterExtend,
+                        rest: List.dropFirst rest 1,
+                    }
+                    |> takeGb11Help # next cp 
+                AfterExtend if InternalGBP.isZWJ (CodePoint.toU32 cp) -> 
+                    {
+                        acc: List.append acc cp,
+                        state: AfterZWJ,
+                        rest: List.dropFirst rest 1,
+                    }
+                    |> takeGb11Help # next cp 
+                AfterZWJ if InternalEmoji.isPictographic (CodePoint.toU32 cp) -> 
+                    {
+                        acc: List.append acc cp,
+                        state: AfterPictographic,
+                        rest: List.dropFirst rest 1,
+                    }
+                    |> takeGb11Help # next cp 
+                _ -> # every other state / code point combination (including a non-Pictographic first cp) is rejected
+                    Err NotEmojiSequence
+
+expect InternalEmoji.isPictographic 0x1F468
+expect InternalGBP.isZWJ 0x200D
+
+