Skip to content

Commit

Permalink
Update to latest basic-cli
Browse files Browse the repository at this point in the history
  • Loading branch information
lukewilliamboswell committed Dec 15, 2023
1 parent 6c98f6a commit 319a9a0
Show file tree
Hide file tree
Showing 15 changed files with 6,126 additions and 6,002 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@

# Ignore the example binaries
example
examples/simple

# Ignore the generated files
generated-docs
Expand Down
16 changes: 15 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,17 @@
# Work in progress!

Someday this will be a useful collection of Unicode operations, but for right now it's extremely WIP.
Someday this will be a useful collection of Unicode operations, but for right now it's extremely WIP.

## Text Segmentation `Str -> List Str`

### General Process
1. Convert to `List U8`
2. Process bytes into code points
3. Process code points into respective [GraphemeBreakProperty](https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt) e.g. U+000D -> CR, \u(11000) -> SpacingMark, \u(AC00) -> LV
4. Apply the [Grapheme Boundary](https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules) rules e.g. [utf8proc implementation](https://github.com/JuliaStrings/utf8proc/blob/1cb28a66ca79a0845e99433fd1056257456cef8b/utf8proc.c#L261) or [this blog](https://halt.software/optimizing-unicodes-grapheme-cluster-break-algorithm/) ** note I like the running state version with a table lookup, might need a `List.getUnchecked` to be efficient??
5. Run through code points to get break indexes, map using `List.subList`

## Other Ideas
- This [online unicode tool](https://util.unicode.org/UnicodeJsps/breaks.jsp) looks helpful for debugging
- Write a script to parse the `.txt` files into useful `.roc` files... e.g. the [Tests](https://www.unicode.org/reports/tr41/tr41-32.html#Tests29) could be auto-generated to test our implementation of boundaries. Note notation `÷` means Break, `×` means Don't Break
- Can apply similar process for Words and Sentence boundaries
2 changes: 1 addition & 1 deletion examples/simple.roc
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
app "example"
packages {
cli: "https://github.com/roc-lang/basic-cli/releases/download/0.5.0/Cufzl36_SnJ4QbOoEmiJ5dIpUxBvdB3NEySvuH82Wio.tar.br",
cli: "https://github.com/roc-lang/basic-cli/releases/download/0.7.0/bkGby8jb0tmZYsy2hg1E_B2QrCgcSTxdUlHtETwm5m4.tar.br",
unicode: "../package/main.roc", # use release URL (ends in tar.br) for local example, see github.com/roc/unicode/releases
}
imports [
Expand Down
4 changes: 2 additions & 2 deletions package/CodePoint.roc
Original file line number Diff line number Diff line change
Expand Up @@ -266,7 +266,7 @@ parseUtf8Help = \rest, cps ->
else
parsePartialUtf8 rest
|> Result.try \{ codePoint, bytesParsed } ->
parseUtf8Help (List.drop rest bytesParsed) (List.append cps codePoint)
parseUtf8Help (List.dropFirst rest bytesParsed) (List.append cps codePoint)

# test simple ASCII "Hello"
expect
Expand Down Expand Up @@ -358,7 +358,7 @@ cpsToStrHelp = \cps, bytes ->
[] -> bytes
[cp,..] ->
cpsToStrHelp
(List.drop cps 1)
(List.dropFirst cps 1)
(CodePoint.appendUtf8 bytes cp)

expect # test toStr
Expand Down
86 changes: 82 additions & 4 deletions package/Grapheme.roc
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ interface Grapheme
CodePoint.{ CodePoint, Utf8ParseErr },
InternalGBP.{ GBP },
InternalCP,
InternalEmoji,
]

## Extended Grapheme Cluster
Expand All @@ -16,7 +17,7 @@ Grapheme : InternalGBP.GBP
# capacity here for the number of substrings
defaultCapacity = 50

## Split a string into extended graphemes clusters
## Split a string into extended grapheme clusters
##
## This is typically associated with "characters" in a string, for example:
## TODO ADD EXAMPLES
Expand Down Expand Up @@ -64,7 +65,7 @@ splitHelp = \cpsWithGpbs, prevState, acc, strs ->
# Set up helper to advance to the next CP recursively and update state
advance = \nextState, nextAcc, nextStrs ->
splitHelp
(List.drop cpsWithGpbs 1)
(List.dropFirst cpsWithGpbs 1)
nextState
nextAcc
nextStrs
Expand All @@ -76,12 +77,17 @@ splitHelp = \cpsWithGpbs, prevState, acc, strs ->
# Look ahead if we have more than one CodePoint left
[current, next, ..] ->
if isExtendOrZWJOrSpacing next then

# GB11

# GB9 Do not break before extending characters, ZWJ, or spacing marks.
advance
DontBreak
(List.append acc (extractCP current))
strs



else
nextState = gbpRules prevState (extractGBP current)

Expand Down Expand Up @@ -151,7 +157,6 @@ gbpRules = \prevState, gbp ->
(_, Prepend) -> DontBreak
# GB1, GB2 Break at the start and end of text, unless the text is empty
_ -> Break


GCBPState : [
DontBreak,
Expand All @@ -163,4 +168,77 @@ GCBPState : [
AfterRI,
]


# WIP add emoji handling to text segmentation
## Try to take a GB11 emoji sequence from the front of `cps`.
##
## Seeds the GB11 state machine with an empty accumulator, the `Start` state and
## the full input, then hands control to `takeGb11Help`, whose result is returned
## unchanged.
takeGb11 : List CodePoint -> Result { acc: List CodePoint, rest : List CodePoint } [NotEmojiSequence]
takeGb11 = \cps ->
    # TODO is there a better default capacity?
    emptyAcc = List.withCapacity 10

    initial = {
        acc: emptyAcc,
        state: Start,
        rest: cps,
    }

    takeGb11Help initial

# States of the GB11 state machine driven by `takeGb11Help`:
# - Start: nothing consumed yet (the state `takeGb11` begins in)
# - AfterPictographic: the last consumed code point satisfied `InternalEmoji.isPictographic`
# - AfterExtend: the last consumed code point satisfied `InternalGBP.isExtend`
# - AfterZWJ: the last consumed code point satisfied `InternalGBP.isZWJ`
Gb11State : [Start, AfterPictographic, AfterExtend, AfterZWJ]

# State Machine for GB11 "Do not break within emoji modifier sequences or emoji zwj sequences"
#
# Consumes code points from the front of `rest` for as long as they continue a
# `Pictographic (Extend* ZWJ Pictographic)*` sequence, collecting them in `acc`.
#
# Returns `Ok { acc, rest }` once the sequence has ended directly after a
# pictographic code point, where `acc` holds only the code points that belong to
# the sequence and `rest` holds everything that follows it (untouched).
#
# Returns `Err NotEmojiSequence` when the input does not start with a
# pictographic code point, or when the sequence is left dangling after an
# Extend or ZWJ (either by end of input or by an unexpected code point).
takeGb11Help : { acc: List CodePoint, state: Gb11State, rest : List CodePoint} -> Result { acc: List CodePoint, rest : List CodePoint } [NotEmojiSequence]
takeGb11Help = \{ acc, state, rest } ->
    when rest is
        [] ->
            when state is
                # Input ended right after a pictographic: the sequence is complete.
                AfterPictographic -> Ok { acc, rest }
                # Nothing consumed, or a dangling Extend/ZWJ at end of input.
                _ -> Err NotEmojiSequence

        [cp, ..] ->
            when state is
                Start if InternalEmoji.isPictographic (CodePoint.toU32 cp) ->
                    {
                        acc: List.append acc cp,
                        state: AfterPictographic,
                        rest: List.dropFirst rest 1,
                    }
                    |> takeGb11Help # next cp

                AfterPictographic if InternalGBP.isExtend (CodePoint.toU32 cp) ->
                    {
                        acc: List.append acc cp,
                        state: AfterExtend,
                        rest: List.dropFirst rest 1,
                    }
                    |> takeGb11Help # next cp

                AfterPictographic if InternalGBP.isZWJ (CodePoint.toU32 cp) ->
                    {
                        acc: List.append acc cp,
                        state: AfterZWJ,
                        rest: List.dropFirst rest 1,
                    }
                    |> takeGb11Help # next cp

                AfterPictographic ->
                    # `cp` does not continue the sequence, so it must NOT be
                    # consumed: leave it at the head of `rest` for the caller.
                    Ok { acc, rest }

                AfterExtend if InternalGBP.isExtend (CodePoint.toU32 cp) ->
                    {
                        acc: List.append acc cp,
                        state: AfterExtend,
                        rest: List.dropFirst rest 1,
                    }
                    |> takeGb11Help # next cp

                AfterExtend if InternalGBP.isZWJ (CodePoint.toU32 cp) ->
                    {
                        acc: List.append acc cp,
                        state: AfterZWJ,
                        rest: List.dropFirst rest 1,
                    }
                    |> takeGb11Help # next cp

                AfterZWJ if InternalEmoji.isPictographic (CodePoint.toU32 cp) ->
                    {
                        acc: List.append acc cp,
                        state: AfterPictographic,
                        rest: List.dropFirst rest 1,
                    }
                    |> takeGb11Help # next cp

                _ ->
                    Err NotEmojiSequence

# Sanity checks for the predicates the GB11 state machine relies on
# U+1F468 (MAN emoji) should be reported as pictographic
expect InternalEmoji.isPictographic 0x1F468
# U+200D is ZERO WIDTH JOINER
expect InternalGBP.isZWJ 0x200D


Loading

0 comments on commit 319a9a0

Please sign in to comment.