~eliasnaur/gio

5c54268d4001eef33b15146f14d7b23bdcfd1905 — Chris Waldon 6 months ago 36e768e
widget: [API] implement UAX#29 grapheme clustering in text widgets

This commit teaches the text widgets how to position their cursor according to
grapheme cluster boundaries rather than rune boundaries. While this is more work,
the results better match the expectations of users. A "grapheme cluster" is a
user-perceived character that may be composed of arbitrarily many runes.

I chose to implement this within widgets for two reasons:

- grapheme cluster boundaries would be extremely difficult to encode within the
glyph stream returned by the text shaper
- not all text needs to be segmented, only text that can be interacted with

All mutation operations exposed by widget.Editor now work in terms of grapheme
clusters instead of runes.

Signed-off-by: Chris Waldon <christopher.waldon.dev@gmail.com>
4 files changed, 407 insertions(+), 19 deletions(-)

M widget/editor.go
M widget/index.go
M widget/index_test.go
M widget/text.go
M widget/editor.go => widget/editor.go +10 -5
@@ 742,18 742,21 @@ func (e *Editor) CaretCoords() f32.Point {
// direction to delete: positive is forward, negative is backward.
//
// If there is a selection, it is deleted and counts as a single grapheme cluster.
func (e *Editor) Delete(runes int) {
func (e *Editor) Delete(graphemeClusters int) {
	e.initBuffer()
	if runes == 0 {
	if graphemeClusters == 0 {
		return
	}

	start, end := e.text.Selection()
	if start != end {
		runes -= sign(runes)
		graphemeClusters -= sign(graphemeClusters)
	}

	end += runes
	// Move caret by the target quantity of clusters.
	e.text.MoveCaret(0, graphemeClusters)
	// Get the new rune offsets of the selection.
	start, end = e.text.Selection()
	e.replace(start, end, "", true)
	// Reset xoff.
	e.text.MoveCaret(0, 0)


@@ 889,7 892,9 @@ func (e *Editor) replace(start, end int, s string, addHistory bool) int {

// MoveCaret moves the caret (aka selection start) and the selection end
// relative to their current positions. Positive distances moves forward,
// negative distances moves backward. Distances are in runes.
// negative distances moves backward. Distances are in grapheme clusters,
// which closely match what users perceive as "characters" even when the
// characters are multiple code points long.
func (e *Editor) MoveCaret(startDelta, endDelta int) {
	e.initBuffer()
	e.text.MoveCaret(startDelta, endDelta)

M widget/index.go => widget/index.go +74 -0
@@ 3,11 3,14 @@
package widget

import (
	"bufio"
	"image"
	"io"
	"math"
	"sort"

	"gioui.org/text"
	"github.com/go-text/typesetting/segmenter"
	"golang.org/x/image/math/fixed"
)



@@ 415,3 418,74 @@ func (g *glyphIndex) locate(viewport image.Rectangle, startRune, endRune int, re
	}
	return rects
}

// graphemeReader segments paragraphs of text into grapheme clusters.
//
// It decodes runes from source one paragraph at a time (a paragraph ends at,
// and includes, a '\n') and reports each paragraph's cluster boundaries as
// rune offsets counted from the start of source. The fields are:
//   - Segmenter: the embedded segmenter used to split a paragraph into
//     grapheme clusters.
//   - graphemes: reusable buffer holding the boundaries of the most recently
//     segmented paragraph.
//   - paragraph: reusable buffer holding the runes of the most recently
//     decoded paragraph.
//   - source: the text being segmented.
//   - cursor: byte offset into source at which the next Read begins.
//   - reader: buffered rune decoder layered on top of Read.
//   - runeOffset: total number of runes in the paragraphs already segmented;
//     it is added to paragraph-relative offsets to produce boundaries that
//     are relative to the whole source.
type graphemeReader struct {
	segmenter.Segmenter
	graphemes  []int
	paragraph  []rune
	source     io.ReaderAt
	cursor     int64
	reader     *bufio.Reader
	runeOffset int
}

// SetSource configures the reader to pull from source. Any progress made on a
// previous source is discarded, so the next call to Graphemes starts at the
// beginning of the new text.
func (p *graphemeReader) SetSource(source io.ReaderAt) {
	p.source = source
	p.cursor = 0
	p.runeOffset = 0
	if p.reader == nil {
		p.reader = bufio.NewReader(p)
		return
	}
	// Reuse the existing buffer rather than allocating a fresh one every time
	// the source is (re)set; Reset drops any buffered bytes and pending error
	// and points the reader back at p.
	p.reader.Reset(p)
}

// Read exists to satisfy io.Reader. It should not be directly invoked.
//
// It fills b from the source starting at the current byte cursor and then
// advances the cursor by however many bytes were read, passing the source's
// count and error straight through.
func (p *graphemeReader) Read(b []byte) (int, error) {
	read, err := p.source.ReadAt(b, p.cursor)
	p.cursor += int64(read)
	return read, err
}

// next decodes one paragraph of rune data into the reusable paragraph buffer.
// A paragraph extends up to and including the next '\n'. The boolean result
// is true when the paragraph was terminated by a newline, and false when
// decoding stopped because the underlying reader returned an error (such as
// io.EOF); in that case the returned runes are whatever was decoded before
// the error. The returned slice is overwritten by the following call.
func (p *graphemeReader) next() ([]rune, bool) {
	p.paragraph = p.paragraph[:0]
	for {
		r, _, err := p.reader.ReadRune()
		if err != nil {
			return p.paragraph, false
		}
		p.paragraph = append(p.paragraph, r)
		if r == '\n' {
			return p.paragraph, true
		}
	}
}

// Graphemes will return the next paragraph's grapheme cluster boundaries,
// if any. If it returns an empty slice, there is no more data (all paragraphs
// have been segmented).
//
// Boundaries are rune offsets relative to the start of the source: the first
// entry is the start of the paragraph's first cluster and every later entry
// is the end of a cluster. The returned slice is reused by the next call.
func (p *graphemeReader) Graphemes() []int {
	p.graphemes = p.graphemes[:0]
	runes, more := p.next()
	if !more && len(runes) == 0 {
		return nil
	}
	p.Segmenter.Init(runes)
	clusters := p.Segmenter.GraphemeIterator()
	for first := true; clusters.Next(); first = false {
		cluster := clusters.Grapheme()
		begin := p.runeOffset + cluster.Offset
		if first {
			// Only the first cluster contributes its starting boundary; for
			// every other cluster the start is the previous cluster's end.
			p.graphemes = append(p.graphemes, begin)
		}
		p.graphemes = append(p.graphemes, begin+len(cluster.Text))
	}
	// Shift subsequent paragraphs past the runes consumed here.
	p.runeOffset += len(runes)
	return p.graphemes
}

M widget/index_test.go => widget/index_test.go +231 -0
@@ 1,6 1,8 @@
package widget

import (
	"bytes"
	"io"
	"testing"

	nsareg "eliasnaur.com/font/noto/sans/arabic/regular"


@@ 550,3 552,232 @@ func printGlyphs(t *testing.T, glyphs []text.Glyph) {
		t.Logf("glyphs[%2d] = {ID: 0x%013x, Flags: %4s, Advance: %4d(%6v), Runes: %d, Y: %3d, X: %4d(%6v)} ", i, g.ID, g.Flags, g.Advance, g.Advance, g.Runes, g.Y, g.X, g.X)
	}
}

// TestGraphemeReaderNext checks that next splits each sample document into
// paragraphs whose only newline (if any) is the final rune, and that
// concatenating the paragraphs reproduces the document rune for rune.
func TestGraphemeReaderNext(t *testing.T) {
	latinDoc := bytes.NewReader([]byte(latinDocument))
	arabicDoc := bytes.NewReader([]byte(arabicDocument))
	emojiDoc := bytes.NewReader([]byte(emojiDocument))
	complexDoc := bytes.NewReader([]byte(complexDocument))
	type testcase struct {
		name  string
		input *bytes.Reader
		read  func() ([]rune, bool)
	}
	var pr graphemeReader
	for _, tc := range []testcase{
		{
			name:  "latin",
			input: latinDoc,
			read:  pr.next,
		},
		{
			name:  "arabic",
			input: arabicDoc,
			read:  pr.next,
		},
		{
			name:  "emoji",
			input: emojiDoc,
			read:  pr.next,
		},
		{
			name:  "complex",
			input: complexDoc,
			read:  pr.next,
		},
	} {
		t.Run(tc.name, func(t *testing.T) {
			pr.SetSource(tc.input)

			var runes []rune
			var paragraph []rune
			ok := true
			for ok {
				paragraph, ok = tc.read()
				for i, r := range paragraph {
					if i == len(paragraph)-1 {
						if r != '\n' && ok {
							t.Error("non-final paragraph does not end with newline")
						}
					} else if r == '\n' {
						t.Errorf("paragraph[%d] contains newline", i)
					}
				}
				runes = append(runes, paragraph...)
			}
			// Re-read the whole document independently of the reader under
			// test to obtain the expected rune sequence.
			if _, err := tc.input.Seek(0, io.SeekStart); err != nil {
				t.Fatalf("rewinding input: %v", err)
			}
			b, err := io.ReadAll(tc.input)
			if err != nil {
				t.Fatalf("reading input: %v", err)
			}
			asRunes := []rune(string(b))
			if len(asRunes) != len(runes) {
				t.Errorf("expected %d runes, got %d", len(asRunes), len(runes))
			}
			for i := 0; i < max(len(asRunes), len(runes)); i++ {
				switch {
				case i >= len(runes):
					t.Errorf("expected runes[%d]=%d, got nothing", i, asRunes[i])
				case i >= len(asRunes):
					t.Errorf("expected runes[%d]=nothing, got %d", i, runes[i])
				case runes[i] != asRunes[i]:
					t.Errorf("expected runes[%d]=%d, got %d", i, asRunes[i], runes[i])
				}
			}
		})
	}
}
func TestGraphemeReaderGraphemes(t *testing.T) {
	latinDoc := bytes.NewReader([]byte(latinDocument))
	arabicDoc := bytes.NewReader([]byte(arabicDocument))
	emojiDoc := bytes.NewReader([]byte(emojiDocument))
	complexDoc := bytes.NewReader([]byte(complexDocument))
	type testcase struct {
		name  string
		input *bytes.Reader
		read  func() []int
	}
	var pr graphemeReader
	for _, tc := range []testcase{
		{
			name:  "latin",
			input: latinDoc,
			read:  pr.Graphemes,
		},
		{
			name:  "arabic",
			input: arabicDoc,
			read:  pr.Graphemes,
		},
		{
			name:  "emoji",
			input: emojiDoc,
			read:  pr.Graphemes,
		},
		{
			name:  "complex",
			input: complexDoc,
			read:  pr.Graphemes,
		},
	} {
		t.Run(tc.name, func(t *testing.T) {
			pr.SetSource(tc.input)

			graphemes := []int{}
			for g := tc.read(); len(g) > 0; g = tc.read() {
				if len(graphemes) > 0 && g[0] != graphemes[len(graphemes)-1] {
					t.Errorf("expected first boundary in new paragraph %d to match final boundary in previous %d", g[0], graphemes[len(graphemes)-1])
				}
				if len(graphemes) > 0 {
					// Drop duplicated boundary.
					g = g[1:]
				}
				graphemes = append(graphemes, g...)
			}
			tc.input.Seek(0, 0)
			b, _ := io.ReadAll(tc.input)
			asRunes := []rune(string(b))
			if len(asRunes)+1 < len(graphemes) {
				t.Errorf("expected <= %d graphemes, got %d", len(asRunes)+1, len(graphemes))
			}
			for i := 0; i < len(graphemes)-1; i++ {
				if graphemes[i] >= graphemes[i+1] {
					t.Errorf("graphemes[%d](%d) >= graphemes[%d](%d)", i, graphemes[i], i+1, graphemes[i+1])
				}
			}
		})
	}
}
// BenchmarkGraphemeReaderNext measures how long it takes to decode every
// paragraph of each sample document with next. The reader is re-pointed at
// the document on every iteration so each pass starts from the beginning.
func BenchmarkGraphemeReaderNext(b *testing.B) {
	latinDoc := bytes.NewReader([]byte(latinDocument))
	arabicDoc := bytes.NewReader([]byte(arabicDocument))
	emojiDoc := bytes.NewReader([]byte(emojiDocument))
	complexDoc := bytes.NewReader([]byte(complexDocument))
	type testcase struct {
		name  string
		input *bytes.Reader
		read  func() ([]rune, bool)
	}
	pr := &graphemeReader{}
	for _, tc := range []testcase{
		{
			name:  "latin",
			input: latinDoc,
			read:  pr.next,
		},
		{
			name:  "arabic",
			input: arabicDoc,
			read:  pr.next,
		},
		{
			name:  "emoji",
			input: emojiDoc,
			read:  pr.next,
		},
		{
			name:  "complex",
			input: complexDoc,
			read:  pr.next,
		},
	} {
		b.Run(tc.name, func(b *testing.B) {
			b.ResetTimer()
			for i := 0; i < b.N; i++ {
				pr.SetSource(tc.input)

				// Drain the document; the decoded runes themselves are not
				// needed, only the work of producing them.
				for ok := true; ok; {
					_, ok = tc.read()
				}
			}
		})
	}
}
func BenchmarkGraphemeReaderGraphemes(b *testing.B) {
	latinDoc := bytes.NewReader([]byte(latinDocument))
	arabicDoc := bytes.NewReader([]byte(arabicDocument))
	emojiDoc := bytes.NewReader([]byte(emojiDocument))
	complexDoc := bytes.NewReader([]byte(complexDocument))
	type testcase struct {
		name  string
		input *bytes.Reader
		read  func() []int
	}
	pr := &graphemeReader{}
	for _, tc := range []testcase{
		{
			name:  "latin",
			input: latinDoc,
			read:  pr.Graphemes,
		},
		{
			name:  "arabic",
			input: arabicDoc,
			read:  pr.Graphemes,
		},
		{
			name:  "emoji",
			input: emojiDoc,
			read:  pr.Graphemes,
		},
		{
			name:  "complex",
			input: complexDoc,
			read:  pr.Graphemes,
		},
	} {
		b.Run(tc.name, func(b *testing.B) {
			b.ResetTimer()
			for i := 0; i < b.N; i++ {
				pr.SetSource(tc.input)
				for g := tc.read(); len(g) > 0; g = tc.read() {
					_ = g
				}
			}
		})
	}
}

M widget/text.go => widget/text.go +92 -14
@@ 17,6 17,7 @@ import (
	"gioui.org/op/paint"
	"gioui.org/text"
	"gioui.org/unit"
	"golang.org/x/exp/slices"
	"golang.org/x/image/math/fixed"
)



@@ 54,12 55,16 @@ type textView struct {
	// are accessed by Len, Text, and SetText.
	Mask rune

	font               text.Font
	shaper             *text.Shaper
	textSize           fixed.Int26_6
	seekCursor         int64
	rr                 textSource
	maskReader         maskReader
	font       text.Font
	shaper     *text.Shaper
	textSize   fixed.Int26_6
	seekCursor int64
	rr         textSource
	maskReader maskReader
	// graphemes tracks the indices of grapheme cluster boundaries within rr.
	graphemes []int
	// paragraphReader is used to populate graphemes.
	paragraphReader    graphemeReader
	lastMask           rune
	maxWidth, minWidth int
	viewSize           image.Point


@@ 163,12 168,43 @@ func (e *textView) closestToXY(x fixed.Int26_6, y int) combinedPos {
	return e.index.closestToXY(x, y)
}

// closestToXYGraphemes returns the position of a grapheme cluster boundary
// near the provided coordinates. It considers two candidate boundaries around
// the rune position closest to (x, y) and returns whichever is horizontally
// nearer to x, preferring the first candidate on a tie.
func (e *textView) closestToXYGraphemes(x fixed.Int26_6, y int) combinedPos {
	// Start from the closest existing rune position to the point.
	runeIdx := e.closestToXY(x, y).runes
	// Resolve that rune position to a cluster boundary.
	nearIdx := e.moveByGraphemes(runeIdx, 0)
	// If resolving moved forward, the other candidate is the boundary before
	// it; otherwise it is the boundary after it.
	step := 1
	if nearIdx > runeIdx {
		step = -1
	}
	farIdx := e.moveByGraphemes(nearIdx, step)

	near := e.closestToRune(nearIdx)
	far := e.closestToRune(farIdx)
	// Only the horizontal distance to the target is compared.
	if absFixed(far.x-x) < absFixed(near.x-x) {
		return far
	}
	return near
}

// absFixed returns the absolute value of i.
func absFixed(i fixed.Int26_6) fixed.Int26_6 {
	if i >= 0 {
		return i
	}
	return -i
}

// MoveLines moves the cursor the specified number of lines vertically, ensuring
// that the resulting position is aligned to a grapheme cluster.
func (e *textView) MoveLines(distance int, selAct selectionAction) {
	caretStart := e.closestToRune(e.caret.start)
	x := caretStart.x + e.caret.xoff
	// Seek to line.
	pos := e.closestToLineCol(caretStart.lineCol.line+distance, 0)
	pos = e.closestToXY(x, pos.y)
	pos = e.closestToXYGraphemes(x, pos.y)
	e.caret.start = pos.runes
	e.caret.xoff = x - pos.x
	e.updateSelection(selAct)


@@ 399,10 435,12 @@ func (e *textView) scrollAbs(x, y int) {
	}
}

// MoveCoord moves the caret to the position closest to the provided
// point that is aligned to a grapheme cluster boundary.
func (e *textView) MoveCoord(pos image.Point) {
	x := fixed.I(pos.X + e.scrollOff.X)
	y := pos.Y + e.scrollOff.Y
	e.caret.start = e.closestToXY(x, y).runes
	e.caret.start = e.closestToXYGraphemes(x, y).runes
	e.caret.xoff = 0
}



@@ 431,9 469,16 @@ func (e *textView) layoutText(lt *text.Shaper) {
		for _, _, err := b.ReadRune(); err != io.EOF; _, _, err = b.ReadRune() {
			g, _ := it.processGlyph(text.Glyph{Runes: 1, Flags: text.FlagClusterBreak}, true)
			e.index.Glyph(g)

		}
	}
	e.paragraphReader.SetSource(e.rr)
	e.graphemes = e.graphemes[:0]
	for g := e.paragraphReader.Graphemes(); len(g) > 0; g = e.paragraphReader.Graphemes() {
		if len(e.graphemes) > 0 && g[0] == e.graphemes[len(e.graphemes)-1] {
			g = g[1:]
		}
		e.graphemes = append(e.graphemes, g...)
	}
	dims := layout.Dimensions{Size: it.bounds.Size()}
	dims.Baseline = dims.Size.Y - it.baseline
	e.dims = dims


@@ 521,44 566,74 @@ func (e *textView) Replace(start, end int, s string) int {
	return sc
}

// MovePages moves the caret position by vertical pages of text, ensuring that
// the final position is aligned to a grapheme cluster boundary.
func (e *textView) MovePages(pages int, selAct selectionAction) {
	caret := e.closestToRune(e.caret.start)
	x := caret.x + e.caret.xoff
	y := caret.y + pages*e.viewSize.Y
	pos := e.closestToXY(x, y)
	pos := e.closestToXYGraphemes(x, y)
	e.caret.start = pos.runes
	e.caret.xoff = x - pos.x
	e.updateSelection(selAct)
}

// moveByGraphemes returns the rune index resulting from moving the
// specified number of grapheme clusters from startRuneidx.
//
// startRuneidx is first located among the known cluster boundaries (a
// position that is not itself a boundary counts as the next boundary after
// it), the distance is applied, and the result is clamped to the first and
// last boundaries. When no boundaries are known, startRuneidx is returned
// unchanged.
func (e *textView) moveByGraphemes(startRuneidx, graphemes int) int {
	if len(e.graphemes) == 0 {
		return startRuneidx
	}
	boundary, _ := slices.BinarySearch(e.graphemes, startRuneidx)
	boundary += graphemes
	if boundary < 0 {
		boundary = 0
	}
	if last := len(e.graphemes) - 1; boundary > last {
		boundary = last
	}
	target := e.graphemes[boundary]
	return e.closestToRune(target).runes
}

// clampCursorToGraphemes ensures that the final start/end positions of
// the cursor are on grapheme cluster boundaries.
//
// Each end is passed through moveByGraphemes with a distance of zero, which
// leaves a position that is already a boundary where it is and otherwise
// selects the next boundary after it (or the last boundary when there is
// none after it).
func (e *textView) clampCursorToGraphemes() {
	e.caret.start = e.moveByGraphemes(e.caret.start, 0)
	e.caret.end = e.moveByGraphemes(e.caret.end, 0)
}

// MoveCaret moves the caret (aka selection start) and the selection end
// relative to their current positions. Positive distances moves forward,
// negative distances moves backward. Distances are in runes.
// negative distances moves backward. Distances are in grapheme clusters which
// better match the expectations of users than runes.
func (e *textView) MoveCaret(startDelta, endDelta int) {
	e.caret.xoff = 0
	e.caret.start = e.closestToRune(e.caret.start + startDelta).runes
	e.caret.end = e.closestToRune(e.caret.end + endDelta).runes
	e.caret.start = e.moveByGraphemes(e.caret.start, startDelta)
	e.caret.end = e.moveByGraphemes(e.caret.end, endDelta)
}

// MoveStart moves the caret to the start of the current line, ensuring that the resulting
// cursor position is on a grapheme cluster boundary.
func (e *textView) MoveStart(selAct selectionAction) {
	// Find the line holding the caret, then the position of its first column.
	line := e.closestToRune(e.caret.start).lineCol.line
	lineStart := e.closestToLineCol(line, 0)
	e.caret.start = lineStart.runes
	e.caret.xoff = -lineStart.x
	e.updateSelection(selAct)
	// Clamp last so that both ends of the selection are aligned.
	e.clampCursorToGraphemes()
}

// MoveEnd moves the caret to the end of the current line, ensuring that the resulting
// cursor position is on a grapheme cluster boundary.
func (e *textView) MoveEnd(selAct selectionAction) {
	// Find the line holding the caret, then the position of its last column.
	line := e.closestToRune(e.caret.start).lineCol.line
	lineEnd := e.closestToLineCol(line, math.MaxInt)
	e.caret.start = lineEnd.runes
	e.caret.xoff = fixed.I(e.maxWidth) - lineEnd.x
	e.updateSelection(selAct)
	// Clamp last so that both ends of the selection are aligned.
	e.clampCursorToGraphemes()
}

// MoveWord moves the caret to the next word in the specified direction.
// Positive is forward, negative is backward.
// Absolute values greater than one will skip that many words.
// The final caret position will be aligned to a grapheme cluster boundary.
// BUG(whereswaldon): this method's definition of a "word" is currently
// whitespace-delimited. Languages that do not use whitespace to delimit
// words will experience counter-intuitive behavior when navigating by


@@ 598,6 673,7 @@ func (e *textView) MoveWord(distance int, selAct selectionAction) {
		}
	}
	e.updateSelection(selAct)
	e.clampCursorToGraphemes()
}

func (e *textView) ScrollToCaret() {


@@ 635,11 711,13 @@ func (e *textView) Selection() (start, end int) {
	return e.caret.start, e.caret.end
}

// SetCaret moves the caret to start, and sets the selection end to end. start
// SetCaret moves the caret to start, and sets the selection end to end. Then
// the two ends are clamped to the nearest grapheme cluster boundary. start
// and end are in runes, and represent offsets into the editor text.
func (e *textView) SetCaret(start, end int) {
	// Resolve each requested offset through closestToRune and keep the rune
	// index it reports.
	e.caret.start = e.closestToRune(start).runes
	e.caret.end = e.closestToRune(end).runes
	// Align both ends of the selection to grapheme cluster boundaries.
	e.clampCursorToGraphemes()
}

// SelectedText returns the currently selected text (if any) from the editor,