Skip to content

Commit

Permalink
[shaping] add segmenting according to direction, script and face
Browse files Browse the repository at this point in the history
  • Loading branch information
benoitkugler committed Nov 24, 2023
1 parent bd3ecfd commit 1633d0e
Show file tree
Hide file tree
Showing 3 changed files with 429 additions and 71 deletions.
253 changes: 182 additions & 71 deletions shaping/input.go
Original file line number Diff line number Diff line change
Expand Up @@ -104,70 +104,7 @@ func SplitByFontGlyphs(input Input, availableFaces []font.Face) []Input {
// the return value of the `Fontmap.ResolveFace` call.
// The 'Face' field of 'input' is ignored: only 'availableFaces' is used to select the face.
func SplitByFace(input Input, availableFaces Fontmap) []Input {
var splitInputs []Input
currentInput := input
for i := input.RunStart; i < input.RunEnd; i++ {
r := input.Text[i]
if currentInput.Face != nil && ignoreFaceChange(r) {
// add the rune to the current input
continue
}

// select the first font supporting r
selectedFace := availableFaces.ResolveFace(r)

if currentInput.Face == selectedFace {
// add the rune to the current input
continue
}

// new face needed

if i != input.RunStart {
// close the current input ...
currentInput.RunEnd = i
// ... add it to the output ...
splitInputs = append(splitInputs, currentInput)
}

// ... and create a new one
currentInput = input
currentInput.RunStart = i
currentInput.Face = selectedFace
}

// close and add the last input
currentInput.RunEnd = input.RunEnd
splitInputs = append(splitInputs, currentInput)
return splitInputs
}

// ignoreFaceChange returns `true` is the given rune should not trigger
// a change of font.
//
// We don't want space characters to affect font selection; in general,
// it's always wrong to select a font just to render a space.
// We assume that all fonts have the ASCII space, and for other space
// characters if they don't, HarfBuzz will compatibility-decompose them
// to ASCII space...
//
// We don't want to change fonts for line or paragraph separators.
//
// Finaly, we also don't change fonts for what Harfbuzz consider
// as ignorable (however, some Control Format runes like 06DD are not ignored).
//
// The rationale is taken from pango : see bugs
// https://bugzilla.gnome.org/show_bug.cgi?id=355987
// https://bugzilla.gnome.org/show_bug.cgi?id=701652
// https://bugzilla.gnome.org/show_bug.cgi?id=781123
// for more details.
func ignoreFaceChange(r rune) bool {
return unicode.Is(unicode.Cc, r) || // control
unicode.Is(unicode.Cs, r) || // surrogate
unicode.Is(unicode.Zl, r) || // line separator
unicode.Is(unicode.Zp, r) || // paragraph separator
(unicode.Is(unicode.Zs, r) && r != '\u1680') || // space separator != OGHAM SPACE MARK
harfbuzz.IsDefaultIgnorable(r)
return splitByFace(input, availableFaces, nil)
}

// Segmenter holds a state used to split input
Expand All @@ -178,19 +115,24 @@ type Segmenter struct {
// which are alternatively considered as input/output of the segmentation
buffer1, buffer2 []Input

// used to handle Common script
delimStack []delimEntry

// buffer used for bidi segmentation
bidiParagraph bidi.Paragraph
}

// used to handle Common script
parenthesisStack []int
type delimEntry struct {
index int // in the [pairedDelims] list
script language.Script // resolved from the context
}

// Split segments the given [text] according to :
// - text direction
// - script
// - face, as defined by [faces]
//
// As a consequence, the following fields of the returned runs are set :
// As a consequence, it sets the following fields of the returned runs :
// - Text, RunStart, RunEnd
// - Direction
// - Script
Expand All @@ -205,7 +147,7 @@ func (seg *Segmenter) Split(text []rune, faces Fontmap, defaultDirection di.Dire
seg.reset()
seg.splitByBidi(text, defaultDirection)
seg.splitByScript()
seg.splitByFace()
seg.splitByFace(faces)
return seg.buffer1
}

Expand All @@ -221,8 +163,10 @@ func (seg *Segmenter) reset() {
}
seg.buffer1 = seg.buffer1[:0]
seg.buffer2 = seg.buffer2[:0]

// bidiParagraph is reset when using SetString
seg.parenthesisStack = seg.parenthesisStack[:0]

seg.delimStack = seg.delimStack[:0]
}

// fills buffer1
Expand Down Expand Up @@ -269,8 +213,175 @@ func (seg *Segmenter) splitByBidi(text []rune, defaultDirection di.Direction) {
}
}

// lookupDelimIndex binary searches in the list of the paired delimiters,
// and returns -1 if `ch` is not found
func lookupDelimIndex(ch rune) int {
lower := 0
upper := len(pairedDelims) - 1

for lower <= upper {
mid := (lower + upper) / 2

if ch < pairedDelims[mid] {
upper = mid - 1
} else if ch > pairedDelims[mid] {
lower = mid + 1
} else {
return mid
}
}

return -1
}

// uses buffer1 as input and fills buffer2
func (seg *Segmenter) splitByScript() {}
//
// See https://unicode.org/reports/tr24/#Common for reference
func (seg *Segmenter) splitByScript() {
for _, input := range seg.buffer1 {
currentInput := input
currentInput.Script = language.Common

for i := input.RunStart; i < input.RunEnd; i++ {
r := input.Text[i]
rScript := language.LookupScript(r)

// to properly handle Common script,
// we register paired delimiters

delimIndex := -1
if rScript == language.Common {
delimIndex = lookupDelimIndex(r)
}

if delimIndex >= 0 { // handle paired characters
if delimIndex%2 == 0 {
// this is an open character : push it onto the stack
seg.delimStack = append(seg.delimStack, delimEntry{delimIndex, currentInput.Script})
} else {
// this is a close character : try to look backward in the stack
// for its counterpart
counterPartIndex := delimIndex - 1
j := len(seg.delimStack) - 1
for ; j >= 0; j-- {
if seg.delimStack[j].index == counterPartIndex { // found a match, use its script
rScript = seg.delimStack[j].script
break
}
}
// in any case, pop the open characters
if j == -1 {
j = 0
}
seg.delimStack = seg.delimStack[:j]
}
}

// check if we have a 'real' change of script, or not
if rScript == language.Common || rScript == currentInput.Script {
// no change
continue
} else if currentInput.Script == language.Common {
// update the pair stack to attribute the resolved script
for i := range seg.delimStack {
seg.delimStack[i].script = rScript
}
// set the resolved script to the current run,
// but do NOT create a new run
currentInput.Script = rScript
} else {
// split to a new run
if i != input.RunStart { // push the existing one
currentInput.RunEnd = i
seg.buffer2 = append(seg.buffer2, currentInput)
}

currentInput.RunStart = i
currentInput.Script = rScript
}
}
// close and add the last input
currentInput.RunEnd = input.RunEnd
seg.buffer2 = append(seg.buffer2, currentInput)
}
}

// uses buffer2 as input, resets and fills buffer1
func (seg *Segmenter) splitByFace() {}
func (seg *Segmenter) splitByFace(faces Fontmap) {
seg.buffer1 = seg.buffer1[:0]
for _, input := range seg.buffer2 {
seg.buffer1 = splitByFace(input, faces, seg.buffer1)
}
}

func splitByFace(input Input, availableFaces Fontmap, buffer []Input) []Input {
currentInput := input
for i := input.RunStart; i < input.RunEnd; i++ {
r := input.Text[i]
if ignoreFaceChange(r) {
// add the rune to the current input
continue
}

// select the first font supporting r
selectedFace := availableFaces.ResolveFace(r)

// now that we have a font, apply it back,
// but do NOT create a new run
if currentInput.Face == nil {
currentInput.Face = selectedFace
}

if currentInput.Face == selectedFace {
// add the rune to the current input
continue
}

// new face needed

if i != input.RunStart {
// close the current input ...
currentInput.RunEnd = i
// ... add it to the output ...
buffer = append(buffer, currentInput)
}

// ... and create a new one
currentInput = input
currentInput.RunStart = i
currentInput.Face = selectedFace
}

// close and add the last input
currentInput.RunEnd = input.RunEnd
buffer = append(buffer, currentInput)
return buffer
}

// ignoreFaceChange returns `true` is the given rune should not trigger
// a change of font.
//
// We don't want space characters to affect font selection; in general,
// it's always wrong to select a font just to render a space.
// We assume that all fonts have the ASCII space, and for other space
// characters if they don't, HarfBuzz will compatibility-decompose them
// to ASCII space...
//
// We don't want to change fonts for line or paragraph separators.
//
// Finaly, we also don't change fonts for what Harfbuzz consider
// as ignorable (however, some Control Format runes like 06DD are not ignored).
//
// The rationale is taken from pango : see bugs
// https://bugzilla.gnome.org/show_bug.cgi?id=355987
// https://bugzilla.gnome.org/show_bug.cgi?id=701652
// https://bugzilla.gnome.org/show_bug.cgi?id=781123
// for more details.
func ignoreFaceChange(r rune) bool {
return unicode.Is(unicode.Cc, r) || // control
unicode.Is(unicode.Cs, r) || // surrogate
unicode.Is(unicode.Zl, r) || // line separator
unicode.Is(unicode.Zp, r) || // paragraph separator
(unicode.Is(unicode.Zs, r) && r != '\u1680') || // space separator != OGHAM SPACE MARK
harfbuzz.IsDefaultIgnorable(r)
}
Loading

0 comments on commit 1633d0e

Please sign in to comment.