Skip to content

Commit

Permalink
[segmenter] add support for word segmentation
Browse files Browse the repository at this point in the history
  • Loading branch information
benoitkugler committed Jan 19, 2024
1 parent 508cc15 commit ce7f758
Show file tree
Hide file tree
Showing 3 changed files with 939 additions and 56 deletions.
174 changes: 119 additions & 55 deletions segmenter/segmenter.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,25 +18,27 @@ import (
ucd "github.com/go-text/typesetting/unicodedata"
)

// Break is a flag storing the break properties between two runes of
// breakAttr is a flag storing the break properties between two runes of
// the input text.
type Break uint8
type breakAttr uint8

const (
LineBoundary Break = 1 << iota
MandatoryLineBoundary // implies LineBoundary
lineBoundary breakAttr = 1 << iota
mandatoryLineBoundary // implies LineBoundary

// GraphemeBoundary is on if the cursor can appear in front of a character,
// graphemeBoundary is on if the cursor can appear in front of a character,
// i.e. if we are at a grapheme boundary.
GraphemeBoundary
graphemeBoundary

// WordBoundary is on if we are at the beginning or end of a word.
// wordBoundary is on if we are at the beginning or end of a word.
//
// To actually detect words, you should also look for runes
// with the [Alphabetic] property, or with a General_Category of Number.
// See also http://unicode.org/reports/tr44/#Alphabetic and
//
// See also https://unicode.org/reports/tr29/#Word_Boundary_Rules,
// http://unicode.org/reports/tr44/#Alphabetic and
// http://unicode.org/reports/tr44/#General_Category_Values
WordBoundary
wordBoundary
)

const paragraphSeparator rune = 0x2029
Expand Down Expand Up @@ -123,7 +125,7 @@ func newCursor(text []rune) *cursor {
return &cr
}

// ComputeBreakAttributes does the heavy lifting of text segmentation,
// computeBreakAttributes does the heavy lifting of text segmentation,
// by computing a break attribute for each rune.
//
// More precisely, `attributes` must be a slice of length len(text)+1,
Expand All @@ -132,7 +134,7 @@ func newCursor(text []rune) *cursor {
//
// Unicode defines a lot of properties; for now we only handle
// grapheme, word and line breaking.
func ComputeBreakAttributes(text []rune, attributes []Break) {
func computeBreakAttributes(text []rune, attributes []breakAttr) {
// The rules are somewhat complex, but the general logic is pretty simple:
// iterate through the input slice, fetch context information
// from previous and following runes required by the rules,
Expand All @@ -146,21 +148,21 @@ func ComputeBreakAttributes(text []rune, attributes []Break) {
for i := 0; i <= len(text); i++ { // note that we accept i == len(text) to fill the last attribute
cr.startIteration(text, i)

var attr Break
var attr breakAttr

// UAX#29 Grapheme and word Boundaries

isGraphemeBoundary := cr.applyGraphemeBoundaryRules()
if isGraphemeBoundary {
attr |= GraphemeBoundary
attr |= graphemeBoundary
}

isWordBoundary, removePrevNoExtend := cr.applyWordBoundaryRules(i)
if isWordBoundary {
attr |= WordBoundary
attr |= wordBoundary
}
if removePrevNoExtend {
attributes[cr.prevWordNoExtend] &^= WordBoundary
attributes[cr.prevWordNoExtend] &^= wordBoundary
}

// UAX#14 Line Breaking
Expand All @@ -169,14 +171,14 @@ func ComputeBreakAttributes(text []rune, attributes []Break) {
switch bo {
case breakEmpty:
// rule LB31 : default to allow line break
attr |= LineBoundary
attr |= lineBoundary
case breakProhibited:
attr &^= LineBoundary
attr &^= lineBoundary
case breakAllowed:
attr |= LineBoundary
attr |= lineBoundary
case breakMandatory:
attr |= LineBoundary
attr |= MandatoryLineBoundary
attr |= lineBoundary
attr |= mandatoryLineBoundary
}

cr.endIteration(i == 0)
Expand All @@ -186,14 +188,14 @@ func ComputeBreakAttributes(text []rune, attributes []Break) {

// start and end of the paragraph are always
// grapheme boundaries and word boundaries
attributes[0] |= GraphemeBoundary | WordBoundary // Rule GB1 and WB1
attributes[len(text)] |= GraphemeBoundary | WordBoundary // Rule GB2 and WB2
attributes[0] |= graphemeBoundary | wordBoundary // Rule GB1 and WB1
attributes[len(text)] |= graphemeBoundary | wordBoundary // Rule GB2 and WB2

// never break before the first char,
// but always break after the last
attributes[0] &^= LineBoundary // Rule LB2
attributes[len(text)] |= LineBoundary // Rule LB3
attributes[len(text)] |= MandatoryLineBoundary // Rule LB3
attributes[0] &^= lineBoundary // Rule LB2
attributes[len(text)] |= lineBoundary // Rule LB3
attributes[len(text)] |= mandatoryLineBoundary // Rule LB3
}

// Segmenter is the entry point of the package.
Expand All @@ -210,36 +212,36 @@ type Segmenter struct {
text []rune
// with length len(text) + 1 :
// the attribute at index i is about the
// rune at i-1 and i
// See also `computeAttributes`
// rune at i-1 and i.
// See also computeBreakAttributes
// Example :
// text : [b, u, l, l]
// attributes : [<start> b, b u, u l, l l, l <end>]
attributes []Break
attributes []breakAttr
}

// Init resets the segmenter storage with the given input,
// and computes the attributes required to segment the text.
func (seg *Segmenter) Init(paragraph []rune) {
seg.text = append(seg.text[:0], paragraph...)
seg.attributes = append(seg.attributes[:0], make([]Break, len(paragraph)+1)...)
ComputeBreakAttributes(seg.text, seg.attributes)
seg.attributes = append(seg.attributes[:0], make([]breakAttr, len(paragraph)+1)...)
computeBreakAttributes(seg.text, seg.attributes)
}

// attributeIterator is a helper type used to
// handle iterating over a slice of breakAttr
type attributeIterator struct {
src *Segmenter
pos int // the current position in the input slice
lastBreak int // the start of the current segment
flag Break // break where this flag is on
pos int // the current position in the input slice
lastBreak int // the start of the current segment
flag breakAttr // break where this flag is on
}

// next returns true if there is still a segment to process,
// and advances the iterator; or return false.
// if returning true, the segment it at li.lastBreak:li.pos
// if returning true, the segment is at [iter.lastBreak:iter.pos]
func (iter *attributeIterator) next() bool {
iter.lastBreak = iter.pos // remember the start of the next line
iter.lastBreak = iter.pos // remember the start of the next segment
iter.pos++
for iter.pos <= len(iter.src.text) {
// can we break before i ?
Expand All @@ -251,6 +253,17 @@ func (iter *attributeIterator) next() bool {
return false
}

// Line is the content of a line delimited by the segmenter.
//
// Values of this type are built by the Line method of LineIterator.
type Line struct {
// Text contains the runes of the delimited line.
// It is a subslice of the copy of the input stored by the segmenter
// (Init copies the paragraph it receives), not of the caller's slice.
Text []rune
// Offset is the index of the first rune of the line in the input rune slice.
Offset int
// IsMandatoryBreak is true if breaking (at the end of the line)
// is mandatory, that is if the boundary ending the line has
// the mandatoryLineBoundary attribute.
IsMandatoryBreak bool
}

// LineIterator provides a convenient way of
// iterating over the lines delimited by a `Segmenter`.
type LineIterator struct {
Expand All @@ -266,25 +279,22 @@ func (li *LineIterator) Line() Line {
return Line{
Offset: li.lastBreak,
Text: li.src.text[li.lastBreak:li.pos], // pos is not included since we break right before
IsMandatoryBreak: li.src.attributes[li.pos]&MandatoryLineBoundary != 0,
IsMandatoryBreak: li.src.attributes[li.pos]&mandatoryLineBoundary != 0,
}
}

// Line is the content of a line delimited by the segmenter.
type Line struct {
// Text is a subslice of the original input slice, containing the delimited line
Text []rune
// Offset is the start of the line in the input rune slice
Offset int
// IsMandatoryBreak is true if breaking (at the end of the line)
// is mandatory
IsMandatoryBreak bool
}

// LineIterator returns an iterator on the lines
// delimited in [Init].
func (sg *Segmenter) LineIterator() *LineIterator {
return &LineIterator{attributeIterator: attributeIterator{src: sg, flag: LineBoundary}}
return &LineIterator{attributeIterator: attributeIterator{src: sg, flag: lineBoundary}}
}

// Grapheme is the content of a grapheme delimited by the segmenter.
type Grapheme struct {
// Text is a subslice of the original input slice, containing the delimited grapheme.
// NOTE(review): Init copies its input into the segmenter, so this slice
// presumably aliases the segmenter buffer rather than the caller's slice —
// confirm against the Grapheme method of GraphemeIterator.
Text []rune
// Offset is the start of the grapheme in the input rune slice
Offset int
}

// GraphemeIterator provides a convenient way of
Expand All @@ -305,16 +315,70 @@ func (gr *GraphemeIterator) Grapheme() Grapheme {
}
}

// Line is the content of a grapheme delimited by the segmenter.
type Grapheme struct {
// Text is a subslice of the original input slice, containing the delimited grapheme
// GraphemeIterator creates an iterator walking the graphemes
// of the text registered by the last call to [Init].
func (sg *Segmenter) GraphemeIterator() *GraphemeIterator {
	// the shared iteration logic stops at every graphemeBoundary attribute
	base := attributeIterator{src: sg, flag: graphemeBoundary}
	return &GraphemeIterator{attributeIterator: base}
}

// Word is the content of a word delimited by the segmenter.
//
// More precisely, a word is formed by runes
// with the [Alphabetic] property, or with a General_Category of Number,
// delimited by the Word Boundary Unicode Property.
// In practice, WordIterator reports a segment between two word boundaries
// as a word when its first rune belongs to the ucd.Word table.
//
// See also https://unicode.org/reports/tr29/#Word_Boundary_Rules,
// http://unicode.org/reports/tr44/#Alphabetic and
// http://unicode.org/reports/tr44/#General_Category_Values
type Word struct {
// Text contains the runes of the delimited word.
// It is a subslice of the copy of the input stored by the segmenter
// (Init copies the paragraph it receives), not of the caller's slice.
Text []rune
// Offset is the start of the word in the input rune slice
Offset int
}

// WordIterator provides a convenient way of
// iterating over the words delimited by a `Segmenter`.
//
// It walks the word boundaries through the embedded attributeIterator
// and only reports the segments which are actual words.
type WordIterator struct {
attributeIterator

// inWord tells whether the segment starting at the current position
// begins with a rune of the ucd.Word table, in which case the next
// boundary found marks the end of a word.
inWord bool // true if we have seen the start of a word
}

// Next returns true if there is still a word to process,
// and advances the iterator; or return false.
//
// When it returns true, the word is the segment
// [gr.lastBreak:gr.pos] of the segmenter text, as exposed by Word.
//
// Segments between two word boundaries which do not start with
// a rune of ucd.Word (spaces, punctuation, ...) are skipped.
func (gr *WordIterator) Next() bool {
	// Iterate (rather than recurse) over the word boundaries, so that
	// long runs of non-word segments do not grow the call stack.
	for gr.next() {
		// The segment [lastBreak:pos] which just ended is a word
		// if and only if it started with a word rune.
		endsWord := gr.inWord

		// The boundary at pos is also the start of the next segment:
		// always refresh the flag, even when a word has just ended.
		// Otherwise a word directly following another one, without
		// separator (for instance two consecutive ideographs), would be lost.
		gr.inWord = gr.pos < len(gr.src.text) && unicode.Is(ucd.Word, gr.src.text[gr.pos])

		if endsWord {
			return true
		}
	}
	return false
}

// Word returns the `Word` found by the last successful call to Next,
// that is the runes between the two last boundaries visited.
func (gr *WordIterator) Word() Word {
	start, end := gr.lastBreak, gr.pos
	// end is excluded: the boundary is right before the rune at end
	return Word{Text: gr.src.text[start:end], Offset: start}
}

// WordIterator returns an iterator over the words
// delimited in [Init].
func (sg *Segmenter) GraphemeIterator() *GraphemeIterator {
return &GraphemeIterator{attributeIterator: attributeIterator{src: sg, flag: GraphemeBoundary}}
// WordIterator builds an iterator stopping at the word boundaries
// computed by [Init], and reporting only the segments which are words.
func (sg *Segmenter) WordIterator() *WordIterator {
	iter := &WordIterator{attributeIterator: attributeIterator{src: sg, flag: wordBoundary}}
	// the first segment is a word if the text starts with a word rune;
	// an empty text leaves the flag to false
	if len(sg.text) != 0 {
		iter.inWord = unicode.Is(ucd.Word, sg.text[0])
	}
	return iter
}
29 changes: 28 additions & 1 deletion segmenter/segmenter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,21 @@ func collectGraphemes(s *Segmenter, input []rune) []string {
return out
}

// collectWords segments input with s and returns, in order,
// the text of every word reported by the word iterator.
// The result is nil when no word is found.
func collectWords(s *Segmenter, input []rune) []string {
	s.Init(input)

	var words []string
	for it := s.WordIterator(); it.Next(); {
		words = append(words, string(it.Word().Text))
	}
	return words
}

func collectWordBoundaries(s *Segmenter, input []rune) []bool {
s.Init(input)
out := make([]bool, len(s.attributes))
for i, a := range s.attributes {
out[i] = a&WordBoundary != 0
out[i] = a&wordBoundary != 0
}
return out
}
Expand Down Expand Up @@ -156,6 +166,23 @@ func TestWordBreakUnicodeReference(t *testing.T) {
}
}

// TestWordSegmenter checks that the word iterator only yields actual words,
// dropping the spaces and the punctuation found between them.
func TestWordSegmenter(t *testing.T) {
	type testCase struct {
		input string
		words []string
	}
	cases := []testCase{
		{"My name is Cris", []string{"My", "name", "is", "Cris"}},
		{"Je m'appelle Benoit.", []string{"Je", "m'appelle", "Benoit"}},
		{"Hi : nice ?! suit !", []string{"Hi", "nice", "suit"}},
	}

	// the same segmenter is reused to also exercise the reset done by Init
	var seg Segmenter
	for _, tc := range cases {
		got := collectWords(&seg, []rune(tc.input))
		if reflect.DeepEqual(tc.words, got) {
			continue
		}
		t.Errorf("for %s, expected %v, got %v", tc.input, tc.words, got)
	}
}

func lineSegmentCount(s *Segmenter, input []rune) int {
s.Init(input)
iter := s.LineIterator()
Expand Down
Loading

0 comments on commit ce7f758

Please sign in to comment.