~samwhited/xmpp

ref: e8c09b3ff1c1489c21d96a0f7f8f0e8728fc095a xmpp/uri/iri.go -rw-r--r-- 5.9 KiB
e8c09b3fSam Whited design: fix typo in design doc template 1 year, 4 months ago
                                                                                
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
// Copyright 2019 The Mellium Contributors.
// Use of this source code is governed by the BSD 2-clause
// license that can be found in the LICENSE file.

// Package uri parses XMPP URIs and IRIs as defined in RFC 5122.
//
// It also provides easy access to query components defined in XEP-0147: XMPP
// URI Scheme Query Components and the XMPP URI/IRI Querytypes registry.
package uri // import "mellium.im/xmpp/uri"

import (
	"errors"
	"fmt"
	"net/url"
	"strings"
	"unicode/utf8"

	"mellium.im/xmpp/jid"
)

// errBadScheme is the error reported for input whose scheme is not "xmpp".
var errBadScheme = errors.New("uri: expected scheme xmpp")

// URI is a parsed XMPP URI or IRI.
type URI struct {
	// URL is the generic parse of the input. It is embedded, so its fields and
	// methods are available directly on URI.
	*url.URL

	// ToAddr is the recipient address.
	ToAddr jid.JID

	// AuthAddr is empty if we should perform an action as the currently
	// authenticated account or ask the user to input the account to use.
	// Otherwise it is the auth address if present in an xmpp:// URI or IRI.
	AuthAddr jid.JID

	// Action is the first query component without a value and normally determines
	// the action to take when handling the URI. For example, the query string
	// might be ?join to join a chatroom, or ?message to send a message.
	//
	// For more information see XEP-0147: XMPP URI Scheme Query Components.
	Action string
}

// TODO: encoding and escaping, see
// https://tools.ietf.org/html/rfc5122#section-2.7.2

// Parse parses rawuri into a URI structure.
//
// The scheme of rawuri must be "xmpp", otherwise an error is returned. Two
// forms are recognized:
//
//   - xmpp://auth@example.net/to@example.org: the user name and host name of
//     the authority are used to build AuthAddr and the path, minus its leading
//     slash, is parsed as ToAddr. If there is no path ToAddr is left empty.
//   - xmpp:to@example.org: the opaque part is parsed as ToAddr and AuthAddr is
//     left empty.
//
// Action is set to the first query component, in the order written in rawuri,
// that has no value.
//
// Errors from parsing the URL, converting the address to an IRI, or parsing
// either JID are returned unmodified; in that case the returned *URI is nil.
func Parse(rawuri string) (*URI, error) {
	u, err := url.Parse(rawuri)
	if err != nil {
		return nil, err
	}

	if u.Scheme != "xmpp" {
		return nil, errBadScheme
	}

	uri := &URI{
		URL: u,
	}

	// A single iri/err pair is assigned with "=" throughout so that no branch
	// shadows err and every failure is visibly checked where it happens.
	var iri string
	if u.Host != "" {
		// If an authentication address was provided (ie. the URI started with
		// `xmpp://'), parse it out and take the recipient address from the path.
		uri.AuthAddr, err = jid.New(u.User.Username(), u.Hostname(), "")
		if err != nil {
			return nil, err
		}
		if u.Path != "" {
			// Strip the root / and use the path as the JID. The path has already
			// been unescaped by net/url so it must not be unescaped again.
			iri, err = toIRI(u.Path[1:], false)
			if err != nil {
				return nil, err
			}
			uri.ToAddr, err = jid.Parse(iri)
			if err != nil {
				return nil, err
			}
		}
	} else {
		// If no auth address was provided (ie. the URI started with `xmpp:') take
		// the recipient address from the opaque part and ignore the user info.
		// The opaque part is stored raw and still needs unescaping.
		iri, err = toIRI(u.Opaque, true)
		if err != nil {
			return nil, err
		}
		uri.ToAddr, err = jid.Parse(iri)
		if err != nil {
			return nil, err
		}
	}

	uri.Action = actionFromRawQuery(u.RawQuery)

	return uri, nil
}

// actionFromRawQuery returns the unescaped name of the first component of
// rawQuery that has no value ("name" or "name="), or the empty string if there
// is no such component.
//
// Components may be delimited by ';' (the delimiter used by RFC 5122 and
// XEP-0147) or by '&'. The raw query is scanned in order instead of ranging
// over the map returned by (*url.URL).Query because map iteration order is
// random, which makes "first" meaningless, and because since Go 1.17 net/url
// silently drops any field that contains a ';'.
func actionFromRawQuery(rawQuery string) string {
	for rawQuery != "" {
		// Split off the next component at whichever delimiter comes first.
		component := rawQuery
		if i := strings.IndexAny(rawQuery, ";&"); i >= 0 {
			component, rawQuery = rawQuery[:i], rawQuery[i+1:]
		} else {
			rawQuery = ""
		}

		key, value := component, ""
		if i := strings.IndexByte(component, '='); i >= 0 {
			key, value = component[:i], component[i+1:]
		}
		if key == "" || value != "" {
			continue
		}

		name, err := url.QueryUnescape(key)
		if err != nil {
			// Malformed components are skipped, like (*url.URL).Query does.
			continue
		}
		return name
	}
	return ""
}

// String reassembles the URI or IRI into an IRI string.
//
// The embedded URL is serialized and the result is converted with toIRI.
// Because this method cannot report errors, a failed conversion is discarded
// and results in the empty string.
func (u *URI) String() string {
	serialized := u.URL.String()
	converted, _ := toIRI(serialized, true)
	return converted
}

// toIRI converts the URI u to an IRI following the algorithm of RFC 3987 §3.2.
//
// If needsUnescape is true the percent-encodings in u are decoded first; pass
// false when u has already been unescaped. Any error from unescaping is
// returned together with an empty string.
// No check is made that u is a valid URI.
func toIRI(u string, needsUnescape bool) (string, error) {
	if needsUnescape {
		// Steps 1 and 2: treat the URI as US-ASCII octets and convert
		// percent-encodings ("%" followed by two hexadecimal digits) to the
		// corresponding octets. The RFC excludes "%", characters in "reserved",
		// and US-ASCII characters not allowed in URIs from this conversion.
		// NOTE: url.PathUnescape decodes every percent-encoding and does not
		// implement those exclusions.
		//
		// TODO: using PathUnescape to create a new string is very inefficient,
		// but it's the only method available in the standard library for this.
		// In the future we should write an escape/unescaper that implements
		// "golang.org/x/text/transform".Transformer or simply appends to a
		// buffer or byte slice so that the next step can also be done in the
		// same iteration without creating yet another builder.
		unescaped, err := url.PathUnescape(u)
		if err != nil {
			return "", err
		}
		u = unescaped
	}

	// Steps 3 to 5: re-percent-encode every octet that is not part of a
	// strictly legal UTF-8 sequence as well as the characters that are not
	// appropriate in an IRI (sections 2.2, 4.1, and 6.1), then interpret the
	// result as UTF-8.
	return escapeInvalidUTF8(u), nil
}

// escapeInvalidUTF8 works like strings.ToValidUTF8, but rather than
// substituting a fixed replacement it rewrites every byte of a rune rejected
// by runeDisallowed as an upper-case percent escape (%AB). All other input is
// copied through unchanged; if nothing has to be escaped s itself is returned.
func escapeInvalidUTF8(s string) string {
	// This function is a modified form of code copied from
	// go/src/strings/strings.go under the terms of Go's BSD license.
	// See the file LICENSE-GO for details.
	var out strings.Builder

	// Find the offset and width of the first rune that has to be escaped.
	start, firstWid := -1, 0
	for off, r := range s {
		// Cheap pre-filter on the rune value alone. A literal U+FFFD also gets
		// past it, so the width is confirmed by decoding again below.
		if !runeDisallowed(r, 1) {
			continue
		}
		decoded, n := utf8.DecodeRuneInString(s[off:])
		if !runeDisallowed(decoded, n) {
			continue
		}
		start, firstWid = off, n
		break
	}

	// Fast path for unchanged input.
	if start < 0 {
		return s
	}

	// Each escaped byte becomes the 3 bytes of %AB.
	out.Grow(len(s) + 3*firstWid)
	if _, err := out.WriteString(s[:start]); err != nil {
		panic(fmt.Errorf("error writing string to buffer: %w", err))
	}

	tail := s[start:]
	for pos := 0; pos < len(tail); {
		if ch := tail[pos]; ch < utf8.RuneSelf {
			// Single byte (ASCII) runes are never escaped.
			if err := out.WriteByte(ch); err != nil {
				panic(fmt.Errorf("error writing byte to buffer: %w", err))
			}
			pos++
			continue
		}

		r, n := utf8.DecodeRuneInString(tail[pos:])
		chunk := tail[pos : pos+n]
		pos += n

		if !runeDisallowed(r, n) {
			if _, err := out.WriteString(chunk); err != nil {
				panic(fmt.Errorf("error writing remaining string to buffer: %w", err))
			}
			continue
		}

		// Escape the rune one byte at a time.
		for k := 0; k < len(chunk); k++ {
			fmt.Fprintf(&out, "%%%0X", chunk[k:k+1])
		}
	}

	return out.String()
}

// runeDisallowed reports whether the rune r, decoded with a width of wid bytes,
// has to be percent-encoded: either it is the result of a failed UTF-8 decode
// or it is one of the bidirectional formatting characters.
func runeDisallowed(r rune, wid int) bool {
	if r == utf8.RuneError {
		// The utf8.Decode functions report a width of 1 for an invalid rune; a
		// width of 0 means an empty string. A literal U+FFFD in the input is
		// three bytes wide and is therefore allowed.
		return wid == 1
	}

	// RFC 3987 §4.1:
	//
	//     IRIs MUST NOT contain bidirectional formatting characters (LRM, RLM,
	//     LRE, RLE, LRO, RLO, and PDF).
	//
	// LRM and RLM are U+200E and U+200F; the remaining five are the contiguous
	// range U+202A through U+202E.
	isMark := r == '\u200e' || r == '\u200f'
	isEmbeddingOrOverride := '\u202a' <= r && r <= '\u202e'
	return isMark || isEmbeddingOrOverride
}