~samwhited/xmpp

ref: de40e6f9107a5d25ae8c8edad9289fd0c4b12e2f xmpp/styling/styling.go -rw-r--r-- 17.7 KiB
de40e6f9Sam Whited roster: add type to IQ response in test 10 months ago
                                                                                
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
// Copyright 2020 The Mellium Contributors.
// Use of this source code is governed by the BSD 2-clause
// license that can be found in the LICENSE file.

//go:generate go run -tags=tools golang.org/x/tools/cmd/stringer -type=Style

// Package styling implements XEP-0393: Message Styling, a simple styling
// language.
//
// This package does not convert message styling documents into a format usable
// by any other rendering engine (ie. HTML or LaTeX), instead it tokenizes the
// input and provides you with a bitmask of styles that should be applied to
// each token.
//
// Format
//
// Message Styling borrows several familiar formatting options from Markdown
// (though it is much simpler and not compatible with Markdown parsers).
//
// Several inline formats exist:
//
// A string wrapped in _emph_ should be displayed emphasized (normally in
// italics), and strings wrapped in *strong* will have strong emphasis (normally
// bold). The `pre` styling directive results in an inline pre-formatted string
// (monospace with no child formatting allowed), and the ~strike~ style should
// result in text with a line through the middle (strike through).
//
// There are also a few block formats such as a block quotation:
//
//     > A quotation
//     >> Can be multi-level
//
// And a preformatted text block which can have no child blocks, should be
// displayed in a monospace font, and which should not be re-wrapped:
//
//     ```an optional tag may go here.
//     Lines of pre-formatted text go here.
//     ```
package styling // import "mellium.im/xmpp/styling"

import (
	"bufio"
	"bytes"
	"io"
	"unicode"
	"unicode/utf8"
)

// Style is a bitmask that represents a set of styles that can be applied to
// text.
// Each constant below occupies its own bit, so values can be combined with |
// and tested with &.
type Style uint32

// The style bits.
const (
	// BlockPre represents a preformatted text block.
	// It should be displayed in a monospace font with no change to line breaks.
	BlockPre Style = 1 << iota

	// BlockQuote represents a nestable quotation.
	// To get the level of the quotation see the Quote method.
	BlockQuote

	// SpanEmph is an inline span of text that should be displayed in italics.
	SpanEmph

	// SpanStrong is an inline span of text that should be displayed bold.
	SpanStrong

	// SpanStrike is an inline span of text that should be displayed with a
	// horizontal line through the middle (strike through).
	SpanStrike

	// SpanPre is an inline span of text that should be displayed in a monospace
	// font.
	SpanPre

	// Styling directive markers.
	// It is often desirable to distinguish the characters that triggered styling
	// from surrounding text. These bits are set only on styling directives, the
	// characters or sequences of characters that result in the style changing.
	// The corresponding style bit will also be set whenever the start or end bits
	// are set. For example, in *strong* the first "*" will have
	// SpanStrong|SpanStrongStart set, the "strong" will only have SpanStrong set,
	// and the last "*" will have SpanStrong|SpanStrongEnd set.
	//
	// Block quotes have no terminator in the input, so BlockQuoteEnd is only
	// ever set on the empty token that a Decoder inserts when the quote level
	// drops.
	BlockPreStart
	BlockPreEnd
	BlockQuoteStart
	BlockQuoteEnd
	SpanEmphStart
	SpanEmphEnd
	SpanStrongStart
	SpanStrongEnd
	SpanStrikeStart
	SpanStrikeEnd
	SpanPreStart
	SpanPreEnd

	// Various useful masks.
	// These bitmasks are unions of the bits above and are provided as a
	// convenience to make it easy to check what general category of styles is
	// applied, for example mask&SpanDirective != 0 to detect any span start or
	// end marker.
	Block               = BlockPre | BlockQuote
	Span                = SpanEmph | SpanStrong | SpanStrike | SpanPre
	SpanEmphDirective   = SpanEmphStart | SpanEmphEnd
	SpanStrongDirective = SpanStrongStart | SpanStrongEnd
	SpanStrikeDirective = SpanStrikeStart | SpanStrikeEnd
	SpanPreDirective    = SpanPreStart | SpanPreEnd
	SpanDirective       = SpanEmphDirective | SpanStrongDirective | SpanStrikeDirective | SpanPreDirective
	SpanStartDirective  = SpanEmphStart | SpanStrongStart | SpanStrikeStart | SpanPreStart
	SpanEndDirective    = SpanEmphEnd | SpanStrongEnd | SpanStrikeEnd | SpanPreEnd
	BlockPreDirective   = BlockPreStart | BlockPreEnd
	BlockQuoteDirective = BlockQuoteStart | BlockQuoteEnd
	BlockDirective      = BlockPreDirective | BlockQuoteDirective
	BlockStartDirective = BlockPreStart | BlockQuoteStart
	BlockEndDirective   = BlockPreEnd | BlockQuoteEnd
	Directive           = SpanDirective | BlockDirective
	StartDirective      = SpanStartDirective | BlockStartDirective
	EndDirective        = SpanEndDirective | BlockEndDirective
)

// A Token represents a styling directive or unstyled span of text.
// If the token is a preformatted text block start token, Info is a slice of the
// bytes between the code fence (```) and the newline.
type Token struct {
	Mask Style  // styles in effect for this token, including any start/end directive bits
	Data []byte // raw bytes of the token; empty for virtual block quote end tokens
	Info []byte // tag following the fence of a pre block start token, otherwise nil
}

// Copy returns a token whose Data and Info no longer share memory with the
// receiver, making it safe to keep after the decoder's buffer is reused.
// Data is always replaced by a freshly allocated slice (empty but non-nil when
// the receiver has no data), while a nil Info stays nil.
func (t Token) Copy() Token {
	// t is a value receiver, so reassigning its fields only affects the copy
	// that gets returned.
	t.Data = append(make([]byte, 0, len(t.Data)), t.Data...)
	if t.Info != nil {
		t.Info = append(make([]byte, 0, len(t.Info)), t.Info...)
	}
	return t
}

// fence is the three grave accent sequence that opens and closes a
// preformatted text block.
var fence = []byte{'`', '`', '`'}

// isSpace reports whether r counts as whitespace for the purposes of styling:
// either unicode.IsSpace accepts it or it is in the unicode.Space range table.
func isSpace(r rune) bool {
	if unicode.IsSpace(r) {
		return true
	}
	return unicode.Is(unicode.Space, r)
}

// Scan returns a new stateful split function for a bufio.Scanner that splits on
// message styling tokens.
// This function is different from a Decoder in that it will not return block
// quote end tokens.
// Every call returns a split function backed by its own fresh state, so the
// result must not be shared between scanners.
//
// This is a low-level building block of this package.
// Most users will want to use a Decoder instead.
func Scan() bufio.SplitFunc {
	return (&Decoder{}).scan
}

// A Decoder represents a styling lexer reading a particular input stream.
// The parser assumes that input is encoded in UTF-8.
type Decoder struct {
	s                *bufio.Scanner // scanner that tokens are read from; not set on decoders made by Scan or for nested quotes
	clearMask        Style          // bits removed from mask at the start of the next call to scan
	mask             Style          // style bits currently in effect at this decoder's quote level
	quoteSplit       *Decoder       // decoder for the contents of the block quote at this level, or nil
	quoteStarted     bool           // whether this level's block quote start token was consumed on the current line
	lastNewline      bool           // whether the most recent token returned by scan ended with a newline
	hasRun           bool           // whether scan has set up state for the current line; Style reports 0 until then
	spanStack        []byte         // start directive bytes of the spans that are currently open, innermost last
	insertBlockClose bool           // set by Next after buffering a virtual quote end token while the real token is pending
	bufferedToken    *Token         // token prepared by Next and handed out by Token
	lastError        error          // error that made Next return false; io.EOF at the end of input
}

// NewDecoder creates a new styling parser reading from r.
// The reader is always wrapped in a bufio.Scanner that uses the decoder's own
// split function, so input is buffered internally regardless of what r
// implements and the scanner's default buffer limit applies to each token.
func NewDecoder(r io.Reader) *Decoder {
	d := new(Decoder)
	d.s = bufio.NewScanner(r)
	d.s.Split(d.scan)
	return d
}

// Token returns the next styling token in the input stream.
// Returned tokens do not always correspond directly to the input stream.
// For example, at the end of a block quote an empty token is returned with the
// mask BlockQuote|BlockQuoteEnd, but there is no explicit block quote
// terminator character in the input stream so its data will be empty.
//
// Token consumes the token prepared by the last successful call to Next.
// If no token is pending (Next was not called, it returned false, or Token was
// already called for it) the zero Token is returned.
//
// Slices of bytes in the returned token data refer to the parser's internal
// buffer and remain valid only until the next call to Token.
// To acquire a copy of the bytes call the token's Copy method.
func (d *Decoder) Token() Token {
	tok := d.bufferedToken
	d.bufferedToken = nil
	if tok == nil {
		// Nothing was prepared; report an empty token instead of dereferencing
		// a nil pointer.
		return Token{}
	}
	return *tok
}

// Next prepares the next styling token in the input stream for reading with
// the Token method.
// It returns true on success, or false if end of input stream is reached
// or an error happened while preparing it.
// Err should be consulted to distinguish between the two cases.
// Every call to Token, even the first one, must be preceded by a call to Next.
//
// When the block quote level drops, Next first prepares an empty token with
// the mask BlockQuote|BlockQuoteEnd and delivers the token that was actually
// scanned on the following call.
func (d *Decoder) Next() bool {
	if d.insertBlockClose {
		// The virtual block quote end token went out on the previous call; the
		// scanner still holds the real token that triggered it, so hand that
		// out now without scanning again.
		d.insertBlockClose = false
		d.bufferedToken = d.scannedToken()
		return true
	}

	prevLevel := d.Quote()
	if !d.s.Scan() {
		d.lastError = d.s.Err()
		if d.lastError == nil {
			// The scanner reports a clean end of input as a nil error.
			d.lastError = io.EOF
		}
		return false
	}

	// If we've dropped a block quote level, insert a token to indicate that we're
	// at the end of the quote since block quotes have no explicit terminator.
	if d.Quote() < prevLevel {
		d.insertBlockClose = true
		// With insertBlockClose set, Style reports the block quote end mask.
		d.bufferedToken = &Token{Mask: d.Style()}
		return true
	}

	d.bufferedToken = d.scannedToken()
	return true
}

// scannedToken builds a token from the scanner's current bytes and the
// decoder's current style, filling in Info for pre block start tokens.
func (d *Decoder) scannedToken() *Token {
	t := &Token{
		Mask: d.Style(),
		Data: d.s.Bytes(),
	}

	var newlineLen int
	if len(t.Data) > 0 && t.Data[len(t.Data)-1] == '\n' {
		newlineLen = 1
	}
	// Info is whatever sits between the fence and the trailing newline, if any.
	if t.Mask&BlockPreStart == BlockPreStart && len(t.Data) > len(fence)+newlineLen {
		t.Info = t.Data[len(fence) : len(t.Data)-newlineLen]
	}
	return t
}

// Err returns the error, if any, that was encountered while reading the input
// stream.
// It reports the error recorded by the most recent call to Next that returned
// false; when the input was simply exhausted that error is io.EOF.
func (d *Decoder) Err() error {
	return d.lastError
}

// SkipSpan pops tokens from the decoder until it reaches the end of the current
// span, positioning the token stream at the beginning of the next span or
// block.
// If SkipSpan is called while no span is entered it behaves like SkipBlock.
// It returns true once an end directive (or the end of a plain line) has been
// consumed.
// It returns false when Next fails before that point, which includes running
// out of input; Err should be consulted to find out why.
func (d *Decoder) SkipSpan() bool {
	// Every start directive except a block quote start opens something nested.
	// Block quote starts need the level check below because they repeat on
	// every line of an existing quote.
	const opensNested = StartDirective &^ BlockQuoteStart

	level := d.Quote()
	for d.Next() {
		tok := d.Token()

		nested := tok.Mask&opensNested != 0
		if !nested && tok.Mask&BlockQuoteStart != 0 {
			nested = d.Quote() > level
		}

		if nested {
			// Skip the child completely before looking for our own end.
			if !d.SkipSpan() {
				return false
			}
		} else if tok.Mask&EndDirective != 0 || (tok.Mask == 0 && d.lastNewline) {
			return true
		}

		level = d.Quote()
	}
	return d.Err() == nil
}

// SkipBlock pops tokens from the decoder until it reaches the end of the
// current block, positioning the token stream at the beginning of the next
// block.
// It returns true once a block end directive (or the end of a plain line) has
// been consumed.
// It returns false when Next fails before that point, which includes running
// out of input; Err should be consulted to find out why.
// If SkipBlock is called at the beginning of the input stream before any tokens
// have been popped or any blocks have been entered, it will skip the entire
// input stream as if everything were contained in an imaginary "root" block.
func (d *Decoder) SkipBlock() bool {
	// Block start directives other than a block quote start always open a new
	// block.
	const opensNested = BlockStartDirective &^ BlockQuoteStart

	level := d.Quote()
	for d.Next() {
		tok := d.Token()

		// A block quote start only opens a new block when it is deeper than
		// the level we were at before this token; otherwise it merely
		// continues the quote we are already in.
		nested := tok.Mask&opensNested != 0
		if !nested && tok.Mask&BlockQuoteStart != 0 {
			nested = d.Quote() > level
		}

		if nested {
			// Recurse into the inner block and skip all of it.
			if !d.SkipBlock() {
				return false
			}
		} else if tok.Mask&BlockEndDirective != 0 || (tok.Mask == 0 && d.lastNewline) {
			// An end directive, or the end of a plain block, finishes this
			// level of recursion.
			return true
		}

		level = d.Quote()
	}
	return d.Err() == nil
}

// scan is the bufio.SplitFunc behind both Scan and NewDecoder.
// It looks at the start of data, updates the decoder's style mask, and returns
// the next token: a block quote start, a pre block fence or line, or whatever
// scanSpan finds.
// Bits recorded in clearMask by the previous call are removed from the mask at
// the start of each call, so start and end directive bits only live for the
// token they were set on.
// Returning (0, nil, nil) asks the scanner for more data.
func (d *Decoder) scan(data []byte, atEOF bool) (advance int, token []byte, err error) {
	if atEOF && len(data) == 0 {
		return 0, nil, nil
	}
	// Remember whether the token we are about to return ends a line so that the
	// next call can reset the per-line quote state.
	defer func() {
		if len(token) > 0 && token[len(token)-1] == '\n' {
			d.lastNewline = true
		}
	}()
	if d.lastNewline {
		// A new line begins: block quote markers have to be seen again at every
		// level, so clear the per-line flags on this decoder and on every nested
		// quote decoder.
		tmpSplit := d
		for {
			tmpSplit.quoteStarted = false
			tmpSplit.hasRun = false
			if tmpSplit.quoteSplit == nil {
				break
			}
			tmpSplit = tmpSplit.quoteSplit
		}
		d.mask &^= BlockQuote
		d.lastNewline = false
	}
	// Drop the bits that were only meant for the previously returned token.
	d.mask &^= d.clearMask
	d.clearMask = 0

	switch {
	case len(d.spanStack) > 0:
		// If we're already in a span, just keep parsing it.
		return d.scanSpan(data, atEOF)
	case d.mask&BlockPre == BlockPre:
		// If we're inside a preblock, ignore everything else and scan lines until
		// the end of the preblock.
		d.hasRun = true
		return d.scanPre(data, atEOF)
	}

	// Look for new blocks
	switch l := startsBlockQuote(data); {
	case l > 0 && !d.quoteStarted:
		// If we haven't yet consumed our block quote start token, do so.
		d.mask |= BlockQuote | BlockQuoteStart
		d.clearMask |= BlockQuoteStart
		// Setup the inner parser if we haven't done so already.
		if d.quoteSplit == nil {
			d.quoteSplit = &Decoder{}
		}
		d.quoteStarted = true
		d.hasRun = true
		return l, data[:l], nil
	case l > 0 && d.quoteStarted:
		// If we've already consumed a quote start token and we encounter another,
		// let the inner Decoder handle it.
		return d.quoteSplit.scan(data, atEOF)
	case l == 0 && d.quoteSplit != nil:
		// If we're already in a block quote, delegate to the inner Decoder.
		if d.quoteStarted {
			return d.quoteSplit.scan(data, atEOF)
		}
		// If we're not in a block and we couldn't find a block start at our
		// level, we've dropped a level so reset the inner Decoder.
		d.quoteSplit = nil
	}
	d.hasRun = true

	newLineIDX := bytes.IndexByte(data, '\n')
	if bytes.HasPrefix(data, fence) {
		switch {
		case newLineIDX > 0:
			// The full line is the codeblock start.
			d.mask |= BlockPre | BlockPreStart
			d.clearMask |= BlockPreStart
			return newLineIDX + 1, data[:newLineIDX+1], nil
		case atEOF:
			// We're at the end of the file so the token is the remainder of the data.
			d.mask |= BlockPre | BlockPreStart
			d.clearMask |= BlockPreStart
			return len(data), data, nil
		case newLineIDX == -1:
			// The rest of the opening line has not been read yet, ask for more.
			return 0, nil, nil
		}
	}

	// Anything else is span level content.
	return d.scanSpan(data, atEOF)
}

// Quote is the blockquote depth at the current position in the document.
// Each decoder in the chain of nested quote decoders that has consumed its
// quote start token on the current line contributes one level.
func (d *Decoder) Quote() uint {
	var depth uint
	for cur := d; ; cur = cur.quoteSplit {
		// While a virtual block quote end token is still pending, pretend that
		// the level has not been dropped yet.
		if cur.insertBlockClose {
			depth++
		}
		if !cur.quoteStarted {
			return depth
		}
		depth++
	}
}

// Style returns a bitmask representing the currently applied styles at the
// current position in the document.
// The result combines this decoder's mask with that of any nested block quote
// decoder.
func (d *Decoder) Style() Style {
	switch {
	case d.insertBlockClose:
		// A virtual block quote end token is pending; report its mask.
		return BlockQuoteEnd | BlockQuote
	case !d.hasRun:
		// Nothing has been scanned at this level on the current line.
		return 0
	case d.quoteSplit == nil:
		return d.mask
	}
	return d.mask | d.quoteSplit.Style()
}

// scanPre is a split function that's used inside pre blocks where we don't have
// to bother parsing any children.
// All it does is look for the end of the pre block and line breaks.
//
// The block ends at a line that consists of nothing but the fence.
// The returned end token includes the newline after the fence when there is
// one. Every other line, including one that merely starts with the fence, is
// returned verbatim as a single token.
// The result does not depend on atEOF except for the final, unterminated line,
// so the same input produces the same tokens however the reader delivers it.
func (d *Decoder) scanPre(data []byte, atEOF bool) (advance int, token []byte, err error) {
	if bytes.HasPrefix(data, fence) {
		end := -1
		switch {
		case len(data) > len(fence):
			// Only a fence immediately followed by a newline closes the block.
			if data[len(fence)] == '\n' {
				end = len(fence) + 1
			}
		case atEOF:
			// The fence is the very last thing in the input.
			end = len(fence)
		default:
			// We need to know what follows the fence, so get more data.
			return 0, nil, nil
		}
		if end > 0 {
			d.mask |= BlockPreEnd
			d.clearMask |= BlockPre | BlockPreEnd
			return end, data[:end], nil
		}
	}

	// Split on line breaks first so that a closing fence on a later line is
	// still found when all remaining input is already buffered.
	if newLineIDX := bytes.IndexByte(data, '\n'); newLineIDX >= 0 {
		return newLineIDX + 1, data[:newLineIDX+1], nil
	}
	if atEOF {
		// Final line without a trailing newline.
		return len(data), data, nil
	}
	return 0, nil, nil
}

// scanSpan is a split function that finds and splits valid formatted spans.
// It walks data one byte at a time up to the first newline and returns one of
// three things: a run of plain text, a one byte start directive (pushing it on
// spanStack and setting the matching style bits), or a one byte end directive
// (popping spanStack and scheduling the style bits to be cleared).
// A start directive is only accepted once its matching end directive has been
// found later on the same line.
func (d *Decoder) scanSpan(data []byte, atEOF bool) (advance int, token []byte, err error) {
	// Look for span styling directives
	startIDX := -1
	var startDirective byte

	for i, b := range data {
		// If we hit the end of the block without encountering any styling
		// directives, the whole span is a plain text span so just return it.
		if b == '\n' {
			return i + 1, data[:i+1], nil
		}

		if b != '*' && b != '_' && b != '`' && b != '~' {
			// This is not a styling directive. Nothing to do here.
			continue
		}

		// Whether a directive is valid depends on the whitespace around it.
		// At either end of data the decoded rune is utf8.RuneError, which is
		// not a space.
		nextRune, _ := utf8.DecodeRune(data[i+1:])
		nextSpace := isSpace(nextRune)
		prevRune, _ := utf8.DecodeLastRune(data[:i])
		prevSpace := isSpace(prevRune)

		switch {
		case len(d.spanStack) > 0 && b == d.spanStack[len(d.spanStack)-1]:
			// If this is an end directive that matches an outer spans start
			// directive, return the directive, or the rest of the span and clear the
			// mask.
			if i == 0 {
				switch b {
				case '*':
					d.mask |= SpanStrongEnd
					d.clearMask |= SpanStrong | SpanStrongEnd
				case '_':
					d.mask |= SpanEmphEnd
					d.clearMask |= SpanEmph | SpanEmphEnd
				case '~':
					d.mask |= SpanStrikeEnd
					d.clearMask |= SpanStrike | SpanStrikeEnd
				case '`':
					d.mask |= SpanPreEnd
					d.clearMask |= SpanPre | SpanPreEnd
				}
				d.spanStack = d.spanStack[:len(d.spanStack)-1]
				return 1, data[0:1], nil
			}

			// Return whatever remains inside the span.
			return i, data[:i], nil
		case startIDX == -1 && (d.mask&SpanPre == 0):
			// If we haven't found a start directive yet, this directive might be one
			// (unless we're already in a pre-span which doesn't allow children) so
			// check if it matches all the rules defined in XEP-0393 §6.2.
			// if we're at the start of the data (start of the line or just after a
			// previous styling directive) or we're immediately after a space, this
			// is a valid start styling directive.
			if (i == 0 || prevSpace) && !nextSpace {
				// Special case for an immediate closing directive (ie. "**").
				// There is almost certainly a better way to handle this that doesn't
				// require special casing it.
				if len(data) > i+1 && data[i+1] == b {
					continue
				}
				// Only remember the candidate for now; nothing is committed until
				// its matching end directive shows up.
				startIDX = i
				startDirective = b
				continue
			}
		case b == startDirective && !prevSpace && i > startIDX+1:
			// If we have already found a start directive, scan for a matching end
			// directive.

			// If we find one this is a valid span so return the previous plain span
			// (if any):
			if startIDX > 0 {
				return startIDX, data[:startIDX], nil
			}

			// Otherwise the start directive is the first byte of data, so push it
			// on the stack of open spans:
			d.spanStack = append(d.spanStack, b)

			// and return the styling directive itself and set the mask.
			switch b {
			case '*':
				d.mask |= SpanStrong | SpanStrongStart
				d.clearMask |= SpanStrongStart
			case '_':
				d.mask |= SpanEmph | SpanEmphStart
				d.clearMask |= SpanEmphStart
			case '~':
				d.mask |= SpanStrike | SpanStrikeStart
				d.clearMask |= SpanStrikeStart
			case '`':
				d.mask |= SpanPre | SpanPreStart
				d.clearMask |= SpanPreStart
			}
			return 1, data[0:1], nil
		}
	}

	// If we didn't find a span, request new data until we find a span or newline.
	// At the end of the input there is nothing more to wait for, so the
	// remainder is returned as is.
	if atEOF {
		return len(data), data, nil
	}
	return 0, nil, nil
}

// startsBlockQuote reports the length in bytes of the block quote start token
// at the beginning of data: a '>' together with all whitespace that directly
// follows it.
// It returns 0 if data does not begin with '>'.
func startsBlockQuote(data []byte) int {
	if len(data) == 0 || data[0] != '>' {
		return 0
	}

	// n is the length of the token found so far, starting with the '>' itself.
	n := 1
	for n < len(data) {
		r, size := utf8.DecodeRune(data[n:])
		if !isSpace(r) {
			break
		}
		n += size
	}
	return n
}