~mna/runes

d10eed951281ce7c4386213b7b1fa2f3d31d9b69 — Martin Angers 1 year, 2 months ago fbeffa8
try decoding the args
4 files changed, 371 insertions(+), 137 deletions(-)

M main.go
A print.go
A table.go
A table_test.go
M main.go => main.go +66 -137
@@ 4,20 4,12 @@
package main

import (
	"bufio"
	"encoding/json"
	"flag"
	"fmt"
	"io"
	"os"
	"sort"
	"strconv"
	"strings"
	"unicode"
	"unicode/utf16"
	"unicode/utf8"

	"github.com/mattn/go-runewidth"
	"golang.org/x/text/unicode/runenames"
)

func main() {


@@ 28,7 20,6 @@ func main() {
	)
	flag.Usage = usage
	flag.Parse()
	_ = flagJSON

	if *flagHelp || *flagLHelp {
		help()


@@ 42,152 33,90 @@ func main() {
		p = &textPrinter{}
	}

	start, end := rune(12000), rune(13000)
	//args := flag.Args()
	args := flag.Args()
	var rs []rune
	for _, arg := range args {
		if len(arg) == 0 {
			continue
		}

		switch p0 := arg[0]; p0 {
		case 'u', 'U', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
			// it is either a number or a range
			parts := []string{arg}
			if rangeIx := strings.Index(arg, "-"); rangeIx >= 0 {
				parts = []string{arg[:rangeIx], arg[rangeIx+1:]}
			}

		default:
			rs = append(rs, runesSet(arg)...)
		}
	}

	if err := p.printStart(os.Stdout); err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	printRange(p, start, end)

	if err := p.printEnd(); err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
}

type printer interface {
	printStart(w io.Writer) error
	printRune(runeInfo) error
	printEnd() error
}

type textPrinter struct {
	bw *bufio.Writer
}

func (tp *textPrinter) printStart(w io.Writer) error {
	tp.bw = bufio.NewWriter(w)
	return nil
}

func (tp *textPrinter) printRune(ri runeInfo) error {
	var catgs string
	if ri.Valid {
		catgs = fmt.Sprintf("%v", ri.Categories)
	} else {
		catgs = "[!]"
	}
	fmt.Fprintf(tp.bw, "%-7s", catgs)

	wd := runewidth.RuneWidth(ri.Rune)
	rn := fmt.Sprintf("%#U", ri.Rune)
	if n := len(rn) + wd - 1; n < 15 {
		rn += strings.Repeat(" ", 15-n)
	}
	fmt.Fprintf(tp.bw, "%s", rn)

	u8 := fmt.Sprintf("[% X]", ri.UTF8)
	fmt.Fprintf(tp.bw, "%-12s", u8)

	var u16 string
	if len(ri.UTF16) == 2 {
		u16 = fmt.Sprintf("[%X %X]", ri.UTF16[0], ri.UTF16[1])
	} else {
		u16 = fmt.Sprintf("[%X]", ri.UTF16[0])
// decode a command-line argument into a list of runes to print,
// and return true as second value if this is a range in the form
// <start>-<end> (inclusive).
func decode(arg string) (runes []rune, isRange bool, err error) {
	if len(arg) == 0 {
		return nil, false, nil
	}
	fmt.Fprintf(tp.bw, "%-12s", u16)

	fmt.Fprintln(tp.bw, ri.Name)
	return nil
}

func (tp *textPrinter) printEnd() error {
	return tp.bw.Flush()
}

type jsonPrinter struct {
	ris []runeInfo
	bw  *bufio.Writer
}

func (jp *jsonPrinter) printStart(w io.Writer) error {
	jp.bw = bufio.NewWriter(w)
	jp.ris = make([]runeInfo, 0, 1024) // TODO: receive a size hint
	return nil
}

func (jp *jsonPrinter) printRune(ri runeInfo) error {
	ri.UTF8JSON = make([]uint16, len(ri.UTF8))
	for i, b := range ri.UTF8 {
		ri.UTF8JSON[i] = uint16(b)
	p0 := arg[0]
	base := 10
	start := 0
	switch p0 {
	case 'u', 'U':
		if len(arg) == 1 || arg[1] != '+' {
			return runesSet(arg), false, nil
		}
		base = 16
		start = 2 // skip u+
	case '0':
		// A leading 0 followed by x or X introduces a hexadecimal number.
		// The parentheses matter: && binds tighter than ||, so without them
		// the lone argument "0" evaluated arg[1] and panicked with an index
		// out of range. A 0 without the prefix is parsed as decimal below.
		if len(arg) > 1 && (arg[1] == 'x' || arg[1] == 'X') {
			base = 16
			start = 2 // skip 0x
			break
		}
	case '1', '2', '3', '4', '5', '6', '7', '8', '9':
		// ok, decimal number
	default:
		return runesSet(arg), false, nil
	}
	jp.ris = append(jp.ris, ri)
	return nil
}

func (jp *jsonPrinter) printEnd() error {
	enc := json.NewEncoder(jp.bw)
	enc.SetIndent("", "  ")
	if err := enc.Encode(jp.ris); err != nil {
		return err
	num, err := strconv.ParseUint(arg[start:], base, 32)
	if err != nil {
		return nil, false, err
	}
	return jp.bw.Flush()
}

type runeInfo struct {
	Rune       rune     `json:"rune"`
	Name       string   `json:"name"`
	Valid      bool     `json:"valid"`
	Categories []string `json:"categories"`
	UTF8       []byte   `json:"-"`
	UTF16      []uint16 `json:"utf16"`
	UTF8JSON   []uint16 `json:"utf8"`
	runes = append(runes, rune(num))
	return runes, false, nil
}

func info(r rune) runeInfo {
	var buf [utf8.UTFMax]byte

	ri := runeInfo{Rune: r}
	ri.Valid = utf8.ValidRune(r)
	if !ri.Valid {
		return ri
	}

	ri.Name = runenames.Name(r)

	n := utf8.EncodeRune(buf[:], r)
	ri.UTF8 = buf[:n]
	r1, r2 := utf16.EncodeRune(r)
	if r1 == utf8.RuneError && r2 == utf8.RuneError {
		ri.UTF16 = []uint16{uint16(r)}
	} else {
		ri.UTF16 = []uint16{uint16(r1), uint16(r2)}
	}

	var cats []string
	for nm, rt := range unicode.Categories {
		if unicode.Is(rt, r) {
			cats = append(cats, nm)
		}
// returns a slice of runes where each distinct rune in arg is returned.
func runesSet(arg string) []rune {
	m := make(map[rune]bool)
	for _, r := range arg {
		m[r] = true
	}
	sort.Strings(cats)
	ri.Categories = cats
	return ri
}

func printRune(p printer, r rune) {
	ri := info(r)
	p.printRune(ri)
}

func printRange(p printer, start, end rune) {
	for i := start; i <= end; i++ {
		// for ranges, ignore invalid utf8 runes
		if !utf8.ValidRune(i) {
			continue
		}
		printRune(p, i)
	rs := make([]rune, 0, len(m))
	for k := range m {
		rs = append(rs, k)
	}
	sort.Slice(rs, func(l, r int) bool {
		lr, rr := rs[l], rs[r]
		return lr < rr
	})
	return rs
}

func usage() {

A print.go => print.go +161 -0
@@ 0,0 1,161 @@
package main

import (
	"bufio"
	"encoding/json"
	"fmt"
	"io"
	"sort"
	"strings"
	"unicode"
	"unicode/utf16"
	"unicode/utf8"

	"github.com/mattn/go-runewidth"
	"golang.org/x/text/unicode/runenames"
)

// printer is the output strategy for rune information. main drives it by
// calling printStart once, printRune for each rune, and printEnd last.
type printer interface {
	// printStart receives the writer the output must go to.
	printStart(w io.Writer) error
	// printRune handles the information about a single rune.
	printRune(runeInfo) error
	// printEnd is called after the last rune; the printers in this file
	// flush their buffered writer here.
	printEnd() error
}

// runeInfo holds everything that gets printed about a single rune. It is
// filled by info; for an invalid rune only Rune and Valid are set and every
// other field keeps its zero value.
type runeInfo struct {
	// Rune is the code point itself.
	Rune       rune     `json:"rune"`
	// Name is the name returned by runenames.Name.
	Name       string   `json:"name"`
	// Valid is the result of utf8.ValidRune.
	Valid      bool     `json:"valid"`
	// Categories lists, sorted, the keys of unicode.Categories whose range
	// table contains the rune.
	Categories []string `json:"categories"`
	// UTF8 is the UTF-8 encoding of the rune. It is left out of the JSON
	// output, where UTF8JSON takes its place.
	UTF8       []byte   `json:"-"`
	// UTF16 is the UTF-16 encoding: a single code unit or a surrogate pair.
	UTF16      []uint16 `json:"utf16"`
	// UTF8JSON is a copy of UTF8 widened to uint16 by jsonPrinter.printRune,
	// presumably so it is encoded as a JSON array of numbers rather than the
	// base64 string encoding/json produces for []byte.
	UTF8JSON   []uint16 `json:"utf8"`
}

// info builds the runeInfo describing r. A rune that utf8.ValidRune rejects
// gets only Rune and Valid (false) set; a valid one also gets its name, its
// UTF-8 and UTF-16 encodings and the sorted names of its Unicode categories.
func info(r rune) runeInfo {
	if !utf8.ValidRune(r) {
		return runeInfo{Rune: r}
	}

	out := runeInfo{
		Rune:  r,
		Valid: true,
		Name:  runenames.Name(r),
	}

	var enc [utf8.UTFMax]byte
	out.UTF8 = enc[:utf8.EncodeRune(enc[:], r)]

	// utf16.EncodeRune answers with a pair of replacement characters when r
	// does not need a surrogate pair: r is then its own single code unit.
	switch hi, lo := utf16.EncodeRune(r); {
	case hi == utf8.RuneError && lo == utf8.RuneError:
		out.UTF16 = []uint16{uint16(r)}
	default:
		out.UTF16 = []uint16{uint16(hi), uint16(lo)}
	}

	// Map iteration order is random, hence the sort.
	var names []string
	for name, tbl := range unicode.Categories {
		if unicode.Is(tbl, r) {
			names = append(names, name)
		}
	}
	sort.Strings(names)
	out.Categories = names
	return out
}

// printRune looks up the information about r and hands it to p. The error
// returned by the printer is discarded.
func printRune(p printer, r rune) {
	_ = p.printRune(info(r))
}

// printRunes prints the runes of rs one by one, in slice order. Unlike
// printRange it does not filter out invalid runes.
func printRunes(p printer, rs []rune) {
	for i := 0; i < len(rs); i++ {
		printRune(p, rs[i])
	}
}

// printRange prints every valid rune in [start, end], both bounds included.
// Runes that utf8.ValidRune rejects are skipped, and nothing is printed when
// start is greater than end.
func printRange(p printer, start, end rune) {
	// No rune below 0 or above utf8.MaxRune is valid, so clamping the bounds
	// prints exactly the same runes. It also keeps i from overflowing, which
	// made the loop endless when end was the largest int32, and avoids
	// walking millions of code points that can only be skipped.
	if start < 0 {
		start = 0
	}
	if end > utf8.MaxRune {
		end = utf8.MaxRune
	}

	for i := start; i <= end; i++ {
		// for ranges, ignore invalid utf8 runes (the surrogate halves)
		if !utf8.ValidRune(i) {
			continue
		}
		printRune(p, i)
	}
}

// textPrinter is the printer that writes one line of text per rune through a
// buffered writer.
type textPrinter struct {
	// bw is created by printStart and flushed by printEnd.
	bw *bufio.Writer
}

// printStart wraps w in the buffered writer that the following printRune
// calls write to. It always returns nil.
func (tp *textPrinter) printStart(w io.Writer) error {
	buffered := bufio.NewWriter(w)
	tp.bw = buffered
	return nil
}

// printRune writes one line describing ri: the categories (or "[!]" for an
// invalid rune), the rune in %#U form, its UTF-8 bytes, its UTF-16 code
// units and its name, each in a left-aligned, space-padded column.
//
// Writes go to the buffered writer set by printStart. The error of the last
// write is returned; because bufio.Writer keeps returning its first error,
// that also reports a failure of any earlier write of this call.
func (tp *textPrinter) printRune(ri runeInfo) error {
	catgs := "[!]"
	if ri.Valid {
		catgs = fmt.Sprintf("%v", ri.Categories)
	}
	fmt.Fprintf(tp.bw, "%-7s", catgs)

	// Pad the rune column by hand: the character may take 0, 1 or 2 terminal
	// cells, which %-15s cannot know. Count characters rather than bytes
	// (len counted a 3-byte character as 3 columns), then swap the single
	// cell assumed for the character for its real width.
	wd := runewidth.RuneWidth(ri.Rune)
	rn := fmt.Sprintf("%#U", ri.Rune)
	if n := utf8.RuneCountInString(rn) + wd - 1; n < 15 {
		rn += strings.Repeat(" ", 15-n)
	}
	fmt.Fprintf(tp.bw, "%s", rn)

	u8 := fmt.Sprintf("[% X]", ri.UTF8)
	fmt.Fprintf(tp.bw, "%-12s", u8)

	var u16 string
	switch len(ri.UTF16) {
	case 0:
		// An invalid rune has no encoding at all; print it like the empty
		// UTF-8 column instead of indexing into an empty slice.
		u16 = "[]"
	case 2:
		u16 = fmt.Sprintf("[%X %X]", ri.UTF16[0], ri.UTF16[1])
	default:
		u16 = fmt.Sprintf("[%X]", ri.UTF16[0])
	}
	fmt.Fprintf(tp.bw, "%-12s", u16)

	_, err := fmt.Fprintln(tp.bw, ri.Name)
	return err
}

// printEnd flushes the buffered writer set by printStart and returns the
// error of that flush, if any.
func (tp *textPrinter) printEnd() error {
	if err := tp.bw.Flush(); err != nil {
		return err
	}
	return nil
}

// jsonPrinter is the printer that collects every rune it is given and writes
// them all as a single indented JSON array when printEnd is called.
type jsonPrinter struct {
	// ris accumulates the runes handed to printRune.
	ris []runeInfo
	// bw is created by printStart; printEnd encodes into it and flushes it.
	bw  *bufio.Writer
}

// printStart prepares the printer: it wraps w in a buffered writer and
// starts an empty list of runes with room for 1024 entries. It always
// returns nil.
func (jp *jsonPrinter) printStart(w io.Writer) error {
	const initialCap = 1024 // TODO: receive a size hint?

	jp.ris = make([]runeInfo, 0, initialCap)
	jp.bw = bufio.NewWriter(w)
	return nil
}

// printRune stores ri for printEnd. Before storing it, the UTF-8 bytes are
// copied into UTF8JSON, one uint16 per byte; UTF8JSON is an empty, non-nil
// slice when there are no bytes. It always returns nil.
func (jp *jsonPrinter) printRune(ri runeInfo) error {
	widened := make([]uint16, 0, len(ri.UTF8))
	for _, b := range ri.UTF8 {
		widened = append(widened, uint16(b))
	}
	ri.UTF8JSON = widened

	jp.ris = append(jp.ris, ri)
	return nil
}

// printEnd encodes all the collected runes as one JSON array, indented by
// two spaces, and flushes the buffered writer. It returns the encoding
// error if there is one (the writer is then not flushed), otherwise the
// result of the flush.
func (jp *jsonPrinter) printEnd() error {
	encoder := json.NewEncoder(jp.bw)
	encoder.SetIndent("", "  ")

	err := encoder.Encode(jp.ris)
	if err == nil {
		err = jp.bw.Flush()
	}
	return err
}

A table.go => table.go +104 -0
@@ 0,0 1,104 @@
package main

import (
	"fmt"
	"strings"
	"unicode"
)

// table is a bit set over the Unicode code space: rune r is in the table
// when bit r%64 of word r/64 is set. Its 17_408 uint64s hold 1_114_112 bits,
// one for each code point from 0 to unicode.MaxRune. The zero value is the
// empty table.
type table [17408]uint64

// set adds the runes rs to the table. It panics if a rune is negative or
// greater than unicode.MaxRune; the runes that come before the offending one
// in rs have already been added at that point.
func (t *table) set(rs ...rune) {
	for _, r := range rs {
		// Negative runes have to be rejected explicitly: integer division
		// and remainder truncate toward zero, so -63..-1 would address word
		// 0 with a shift count that wraps to a huge value (a silent no-op)
		// and anything lower would index before the start of the array.
		if r < 0 || r > unicode.MaxRune {
			panic(fmt.Sprintf("%#U is outside the Unicode range", r))
		}
		t[r/64] |= 1 << uint64(r%64)
	}
}

// unset removes the runes rs from the table. It panics if a rune is negative
// or greater than unicode.MaxRune; the runes that come before the offending
// one in rs have already been removed at that point.
func (t *table) unset(rs ...rune) {
	for _, r := range rs {
		// Negative runes have to be rejected explicitly: integer division
		// and remainder truncate toward zero, so -63..-1 would address word
		// 0 with a shift count that wraps to a huge value (a silent no-op)
		// and anything lower would index before the start of the array.
		if r < 0 || r > unicode.MaxRune {
			panic(fmt.Sprintf("%#U is outside the Unicode range", r))
		}
		t[r/64] &^= 1 << uint64(r%64)
	}
}

// setRange adds every rune from from to to, both included, to the table. The
// bounds are checked by setUnsetRange, which does the work.
func (t *table) setRange(from, to rune) {
	const set = true
	t.setUnsetRange(from, to, set)
}

// unsetRange removes every rune from from to to, both included, from the
// table. The bounds are checked by setUnsetRange, which does the work.
func (t *table) unsetRange(from, to rune) {
	const set = false
	t.setUnsetRange(from, to, set)
}

// setUnsetRange adds (set is true) or removes (set is false) every rune in
// [from, to], both bounds included. It panics, before touching the table, if
// from is greater than to or if either bound is outside of
// [0, unicode.MaxRune].
func (t *table) setUnsetRange(from, to rune, set bool) {
	if from > to {
		panic(fmt.Sprintf("from rune %#U is greater than to rune %#U", from, to))
	}
	if to > unicode.MaxRune {
		panic(fmt.Sprintf("%#U is outside the Unicode range", to))
	}
	if from < 0 {
		panic(fmt.Sprintf("%#U is outside the Unicode range", from))
	}

	// Walk the range directly rather than materializing it in a slice first:
	// a full-range call would otherwise allocate over a million runes only
	// to pass them on. r cannot overflow since to is at most MaxRune.
	for r := from; r <= to; r++ {
		if set {
			t.set(r)
		} else {
			t.unset(r)
		}
	}
}

// is returns true if r is set. A rune outside of [0, unicode.MaxRune] is
// never set.
func (t *table) is(r rune) bool {
	// The lower bound matters as much as the upper one: a rune of -64 or
	// less would index before the start of the array and panic.
	if r < 0 || r > unicode.MaxRune {
		return false
	}
	return t[r/64]&(1<<uint64(r%64)) != 0
}

// String returns the string representation of the Unicode table: the set
// runes in increasing order, between square brackets and separated by
// commas. Consecutive runes are collapsed into a <first>-<last> range and
// every rune is printed in the %#U format.
func (t *table) String() string {
	var sb strings.Builder
	sb.WriteByte('[')

	// emit appends the run of set runes going from first to last.
	emit := func(first, last rune) {
		// anything beyond the opening bracket means a run came before
		if sb.Len() > 1 {
			sb.WriteByte(',')
		}
		if first == last {
			fmt.Fprintf(&sb, "%#U", first)
			return
		}
		fmt.Fprintf(&sb, "%#U-%#U", first, last)
	}

	// runStart is the first rune of the run being scanned, or none when the
	// previous rune was not set.
	const none = rune(-1)
	runStart := none
	for r := rune(0); r <= unicode.MaxRune; r++ {
		switch set := t.is(r); {
		case set && runStart == none:
			runStart = r
		case !set && runStart != none:
			emit(runStart, r-1)
			runStart = none
		}
	}
	// a run that reaches the last code point is still pending
	if runStart != none {
		emit(runStart, unicode.MaxRune)
	}

	sb.WriteByte(']')
	return sb.String()
}

A table_test.go => table_test.go +40 -0
@@ 0,0 1,40 @@
package main

import (
	"testing"
	"unicode"
)

// TestTable runs a sequence of set/unset operations on a single table and
// checks its string representation after each step. The steps build on one
// another, so their order matters.
func TestTable(t *testing.T) {
	var tbl table

	// expect fails the test if the table does not print as want.
	expect := func(want string) {
		t.Helper()
		if got := tbl.String(); got != want {
			t.Fatalf("want %s, got %s", want, got)
		}
	}

	// individual runes collapse into a range
	tbl.set([]rune("abcd")...)
	expect("[U+0061 'a'-U+0064 'd']")

	// punching a hole in a range splits it
	tbl.setRange('A', 'Z')
	tbl.unsetRange('M', 'Q')
	expect("[U+0041 'A'-U+004C 'L',U+0052 'R'-U+005A 'Z',U+0061 'a'-U+0064 'd']")

	// unsetting a range that spans runes that are not set
	tbl.setRange('╒', '╟')
	tbl.unsetRange('A', 'z')
	expect("[U+2552 '╒'-U+255F '╟']")

	// the very last code point
	tbl.set(unicode.MaxRune)
	expect("[U+2552 '╒'-U+255F '╟',U+10FFFF]")
}