[segmenter] add support for word segmentation

go-text · Jan 19, 2024 · ce7f758 · ce7f758
1 parent 508cc15
commit ce7f758
Show file tree

Hide file tree

Showing 3 changed files with 939 additions and 56 deletions.
diff --git a/segmenter/segmenter.go b/segmenter/segmenter.go
@@ -18,25 +18,27 @@ import (
 	ucd "github.com/go-text/typesetting/unicodedata"
 )
 
-// Break is a flag storing the break properties between two runes of
+// breakAttr is a flag storing the break properties between two runes of
 // the input text.
-type Break uint8
+type breakAttr uint8
 
 const (
-	LineBoundary          Break = 1 << iota
-	MandatoryLineBoundary       // implies LineBoundary
+	lineBoundary          breakAttr = 1 << iota
+	mandatoryLineBoundary           // implies lineBoundary
 
-	// GraphemeBoundary is on if the cursor can appear in front of a character,
+	// graphemeBoundary is on if the cursor can appear in front of a character,
 	// i.e. if we are at a grapheme boundary.
-	GraphemeBoundary
+	graphemeBoundary
 
-	// WordBoundary is on if we are at the beginning or end of a word.
+	// wordBoundary is on if we are at the beginning or end of a word.
 	//
 	// To actually detect words, you should also look for runes
 	// with the [Alphabetic] property, or with a General_Category of Number.
-	// See also http://unicode.org/reports/tr44/#Alphabetic and
+	//
+	// See also https://unicode.org/reports/tr29/#Word_Boundary_Rules,
+	// http://unicode.org/reports/tr44/#Alphabetic and
 	// http://unicode.org/reports/tr44/#General_Category_Values
-	WordBoundary
+	wordBoundary
 )
 
 const paragraphSeparator rune = 0x2029
@@ -123,7 +125,7 @@ func newCursor(text []rune) *cursor {
 	return &cr
 }
 
-// ComputeBreakAttributes does the heavy lifting of text segmentation,
+// computeBreakAttributes does the heavy lifting of text segmentation,
 // by computing a break attribute for each rune.
 //
 // More precisely, `attributes` must be a slice of length len(text)+1,
@@ -132,7 +134,7 @@ func newCursor(text []rune) *cursor {
 //
 // Unicode defines a lot of properties; for now we only handle
 // grapheme, word and line breaking.
-func ComputeBreakAttributes(text []rune, attributes []Break) {
+func computeBreakAttributes(text []rune, attributes []breakAttr) {
 	// The rules are somewhat complex, but the general logic is pretty simple:
 	// iterate through the input slice, fetch context information
 	// from previous and following runes required by the rules,
@@ -146,21 +148,21 @@ func ComputeBreakAttributes(text []rune, attributes []Break) {
 	for i := 0; i <= len(text); i++ { // note that we accept i == len(text) to fill the last attribute
 		cr.startIteration(text, i)
 
-		var attr Break
+		var attr breakAttr
 
 		// UAX#29 Grapheme and word Boundaries
 
 		isGraphemeBoundary := cr.applyGraphemeBoundaryRules()
 		if isGraphemeBoundary {
-			attr |= GraphemeBoundary
+			attr |= graphemeBoundary
 		}
 
 		isWordBoundary, removePrevNoExtend := cr.applyWordBoundaryRules(i)
 		if isWordBoundary {
-			attr |= WordBoundary
+			attr |= wordBoundary
 		}
 		if removePrevNoExtend {
-			attributes[cr.prevWordNoExtend] &^= WordBoundary
+			attributes[cr.prevWordNoExtend] &^= wordBoundary
 		}
 
 		// UAX#14 Line Breaking
@@ -169,14 +171,14 @@ func ComputeBreakAttributes(text []rune, attributes []Break) {
 		switch bo {
 		case breakEmpty:
 			// rule LB31 : default to allow line break
-			attr |= LineBoundary
+			attr |= lineBoundary
 		case breakProhibited:
-			attr &^= LineBoundary
+			attr &^= lineBoundary
 		case breakAllowed:
-			attr |= LineBoundary
+			attr |= lineBoundary
 		case breakMandatory:
-			attr |= LineBoundary
-			attr |= MandatoryLineBoundary
+			attr |= lineBoundary
+			attr |= mandatoryLineBoundary
 		}
 
 		cr.endIteration(i == 0)
@@ -186,14 +188,14 @@ func ComputeBreakAttributes(text []rune, attributes []Break) {
 
 	// start and end of the paragraph are always
 	// grapheme boundaries and word boundaries
-	attributes[0] |= GraphemeBoundary | WordBoundary         // Rule GB1 and WB1
-	attributes[len(text)] |= GraphemeBoundary | WordBoundary // Rule GB2 and WB2
+	attributes[0] |= graphemeBoundary | wordBoundary         // Rule GB1 and WB1
+	attributes[len(text)] |= graphemeBoundary | wordBoundary // Rule GB2 and WB2
 
 	// never break before the first char,
 	// but always break after the last
-	attributes[0] &^= LineBoundary                 // Rule LB2
-	attributes[len(text)] |= LineBoundary          // Rule LB3
-	attributes[len(text)] |= MandatoryLineBoundary // Rule LB3
+	attributes[0] &^= lineBoundary                 // Rule LB2
+	attributes[len(text)] |= lineBoundary          // Rule LB3
+	attributes[len(text)] |= mandatoryLineBoundary // Rule LB3
 }
 
 // Segmenter is the entry point of the package.
@@ -210,36 +212,36 @@ type Segmenter struct {
 	text []rune
 	// with length len(text) + 1 :
 	// the attribute at indice i is about the
-	// rune at i-1 and i
-	// See also `computeAttributes`
+	// rune at i-1 and i.
+	// See also [computeBreakAttributes]
 	// Example :
 	// 	text : 			[b, 		u, 	l, 	l]
 	// 	attributes :	[<start> b, b u, u l, l l, l <end>]
-	attributes []Break
+	attributes []breakAttr
 }
 
 // Init resets the segmenter storage with the given input,
 // and computes the attributes required to segment the text.
 func (seg *Segmenter) Init(paragraph []rune) {
 	seg.text = append(seg.text[:0], paragraph...)
-	seg.attributes = append(seg.attributes[:0], make([]Break, len(paragraph)+1)...)
-	ComputeBreakAttributes(seg.text, seg.attributes)
+	seg.attributes = append(seg.attributes[:0], make([]breakAttr, len(paragraph)+1)...)
+	computeBreakAttributes(seg.text, seg.attributes)
 }
 
 // attributeIterator is an helper type used to
 // handle iterating over a slice of runeAttr
 type attributeIterator struct {
 	src       *Segmenter
-	pos       int   // the current position in the input slice
-	lastBreak int   // the start of the current segment
-	flag      Break // break where this flag is on
+	pos       int       // the current position in the input slice
+	lastBreak int       // the start of the current segment
+	flag      breakAttr // break where this flag is on
 }
 
 // next returns true if there is still a segment to process,
 // and advances the iterator; or return false.
-// if returning true, the segment it at li.lastBreak:li.pos
+// if returning true, the segment is at [iter.lastBreak:iter.pos]
 func (iter *attributeIterator) next() bool {
-	iter.lastBreak = iter.pos // remember the start of the next line
+	iter.lastBreak = iter.pos // remember the start of the next segment
 	iter.pos++
 	for iter.pos <= len(iter.src.text) {
 		// can we break before i ?
@@ -251,6 +253,17 @@ func (iter *attributeIterator) next() bool {
 	return false
 }
 
+// Line is the content of a line delimited by the segmenter.
+type Line struct {
+	// Text is a subslice of the original input slice, containing the delimited line
+	Text []rune
+	// Offset is the start of the line in the input rune slice
+	Offset int
+	// IsMandatoryBreak is true if breaking (at the end of the line)
+	// is mandatory
+	IsMandatoryBreak bool
+}
+
 // LineIterator provides a convenient way of
 // iterating over the lines delimited by a `Segmenter`.
 type LineIterator struct {
@@ -266,25 +279,22 @@ func (li *LineIterator) Line() Line {
 	return Line{
 		Offset:           li.lastBreak,
 		Text:             li.src.text[li.lastBreak:li.pos], // pos is not included since we break right before
-		IsMandatoryBreak: li.src.attributes[li.pos]&MandatoryLineBoundary != 0,
+		IsMandatoryBreak: li.src.attributes[li.pos]&mandatoryLineBoundary != 0,
 	}
 }
 
-// Line is the content of a line delimited by the segmenter.
-type Line struct {
-	// Text is a subslice of the original input slice, containing the delimited line
-	Text []rune
-	// Offset is the start of the line in the input rune slice
-	Offset int
-	// IsMandatoryBreak is true if breaking (at the end of the line)
-	// is mandatory
-	IsMandatoryBreak bool
-}
-
 // LineIterator returns an iterator on the lines
 // delimited in [Init].
 func (sg *Segmenter) LineIterator() *LineIterator {
-	return &LineIterator{attributeIterator: attributeIterator{src: sg, flag: LineBoundary}}
+	return &LineIterator{attributeIterator: attributeIterator{src: sg, flag: lineBoundary}}
+}
+
+// Grapheme is the content of a grapheme delimited by the segmenter.
+type Grapheme struct {
+	// Text is a subslice of the original input slice, containing the delimited grapheme
+	Text []rune
+	// Offset is the start of the grapheme in the input rune slice
+	Offset int
 }
 
 // GraphemeIterator provides a convenient way of
@@ -305,16 +315,70 @@ func (gr *GraphemeIterator) Grapheme() Grapheme {
 	}
 }
 
-// Line is the content of a grapheme delimited by the segmenter.
-type Grapheme struct {
-	// Text is a subslice of the original input slice, containing the delimited grapheme
+// GraphemeIterator returns an iterator over the graphemes
+// delimited in [Init].
+func (sg *Segmenter) GraphemeIterator() *GraphemeIterator {
+	return &GraphemeIterator{attributeIterator: attributeIterator{src: sg, flag: graphemeBoundary}}
+}
+
+// Word is the content of a word delimited by the segmenter.
+//
+// More precisely, a word is formed by runes
+// with the [Alphabetic] property, or with a General_Category of Number,
+// delimited by the Word Boundary Unicode Property.
+//
+// See also https://unicode.org/reports/tr29/#Word_Boundary_Rules,
+// http://unicode.org/reports/tr44/#Alphabetic and
+// http://unicode.org/reports/tr44/#General_Category_Values
+type Word struct {
+	// Text is a subslice of the original input slice, containing the delimited word
 	Text []rune
-	// Offset is the start of the grapheme in the input rune slice
+	// Offset is the start of the word in the input rune slice
 	Offset int
 }
 
-// GraphemeIterator returns an iterator over the graphemes
+type WordIterator struct {
+	attributeIterator
+
+	inWord bool // true if we have seen the start of a word
+}
+
+// Next returns true if there is still a word to process,
+// and advances the iterator; otherwise it returns false.
+func (gr *WordIterator) Next() bool {
+	hasBoundary := gr.next()
+	if !hasBoundary {
+		return false
+	}
+
+	if gr.inWord { // we have reached the END of a word
+		gr.inWord = false
+		return true
+	}
+
+	// do we start a word ? if so, mark it
+	if gr.pos < len(gr.src.text) {
+		gr.inWord = unicode.Is(ucd.Word, gr.src.text[gr.pos])
+	}
+	// in any case, advance again
+	return gr.Next()
+}
+
+// Word returns the current `Word`
+func (gr *WordIterator) Word() Word {
+	return Word{
+		Offset: gr.lastBreak,
+		Text:   gr.src.text[gr.lastBreak:gr.pos],
+	}
+}
+
+// WordIterator returns an iterator over the words
 // delimited in [Init].
-func (sg *Segmenter) GraphemeIterator() *GraphemeIterator {
-	return &GraphemeIterator{attributeIterator: attributeIterator{src: sg, flag: GraphemeBoundary}}
+func (sg *Segmenter) WordIterator() *WordIterator {
+	// check if we start at a word
+	inWord := false
+	if len(sg.text) != 0 {
+		inWord = unicode.Is(ucd.Word, sg.text[0])
+	}
+	return &WordIterator{attributeIterator: attributeIterator{src: sg, flag: wordBoundary}, inWord: inWord}
 }
diff --git a/segmenter/segmenter_test.go b/segmenter/segmenter_test.go
@@ -41,11 +41,21 @@ func collectGraphemes(s *Segmenter, input []rune) []string {
 	return out
 }
 
+func collectWords(s *Segmenter, input []rune) []string {
+	s.Init(input)
+	iter := s.WordIterator()
+	var out []string
+	for iter.Next() {
+		out = append(out, string(iter.Word().Text))
+	}
+	return out
+}
+
 func collectWordBoundaries(s *Segmenter, input []rune) []bool {
 	s.Init(input)
 	out := make([]bool, len(s.attributes))
 	for i, a := range s.attributes {
-		out[i] = a&WordBoundary != 0
+		out[i] = a&wordBoundary != 0
 	}
 	return out
 }
@@ -156,6 +166,23 @@ func TestWordBreakUnicodeReference(t *testing.T) {
 	}
 }
 
+func TestWordSegmenter(t *testing.T) {
+	var seg Segmenter
+	for _, test := range []struct {
+		input string
+		words []string
+	}{
+		{"My name is Cris", []string{"My", "name", "is", "Cris"}},
+		{"Je m'appelle Benoit.", []string{"Je", "m'appelle", "Benoit"}},
+		{"Hi : nice ?! suit !", []string{"Hi", "nice", "suit"}},
+	} {
+		got := collectWords(&seg, []rune(test.input))
+		if !reflect.DeepEqual(test.words, got) {
+			t.Errorf("for %s, expected %v, got %v", test.input, test.words, got)
+		}
+	}
+}
+
 func lineSegmentCount(s *Segmenter, input []rune) int {
 	s.Init(input)
 	iter := s.LineIterator()