~rjarry/aerc

8f15b804d1a760d4f4e5c0c47a8a26ee2c04a830 — Vitaly Ovchinnikov 12 days ago d57aa9e
hyperlinks: better parsing of emails without mailto prefixes

Add some new tests from the emails I have and make them work by
adjusting the code that looks for hyperlinks.

The idea is to treat "inline" emails (those without mailto:) a little
bit differently and stop a little earlier while looking for their ends.

Signed-off-by: Vitaly Ovchinnikov <v@postbox.nz>
Acked-by: Robin Jarry <robin@jarry.cc>
2 files changed, 36 insertions(+), 1 deletions(-)

M lib/parse/hyperlinks.go
M lib/parse/hyperlinks_test.go
M lib/parse/hyperlinks.go => lib/parse/hyperlinks.go +16 -1
@@ 37,6 37,9 @@ func HttpLinks(r io.Reader) (io.Reader, []string) {
		scheme = j - i
		j = scheme

		// "inline" email without a mailto: prefix - add some extra checks for those
		inlineEmail := len(match) > 4 && match[2] == -1 && match[4] == -1

		for !emitUrl && j < len(b) && bytes.IndexByte(urichars, b[j]) != -1 {
			switch b[j] {
			case '[':


@@ 69,9 72,21 @@ func HttpLinks(r io.Reader) (io.Reader, []string) {
				} else {
					j++
				}
			case '&':
				if inlineEmail {
					emitUrl = true
				} else {
					j++
				}
			default:
				j++
			}

			// we don't want those in inline emails
			if inlineEmail && (paren > 0 || ltgt > 0 || bracket > 0) {
				j--
				emitUrl = true
			}
		}

		// Heuristic to remove trailing characters that are


@@ 91,7 106,7 @@ func HttpLinks(r io.Reader) (io.Reader, []string) {
			continue
		}
		url := string(b[:j])
		if match[2] == -1 && match[4] == -1 {
		if inlineEmail {
			// Email address with missing mailto: scheme. Add it.
			url = "mailto:" + url
		}

M lib/parse/hyperlinks_test.go => lib/parse/hyperlinks_test.go +20 -0
@@ 114,6 114,26 @@ func TestHyperlinks(t *testing.T) {
			text:  "You can reach me via the somewhat strange, but nonetheless valid, email mailto:~mpldr/list@[2001:db8::7]?subject=whazzup%3F",
			links: []string{"mailto:~mpldr/list@[2001:db8::7]?subject=whazzup%3F"},
		},
		{
			name:  "simple email in <a href>",
			text:  `<a href="mailto:a@abc.com" rel="noopener noreferrer">`,
			links: []string{"mailto:a@abc.com"},
		},
		{
			name:  "simple email in <a> body",
			text:  `<a href="#" rel="noopener noreferrer">a@abc.com</a><br/><p>more text</p>`,
			links: []string{"mailto:a@abc.com"},
		},
		{
			name:  "emails in <a> href and body",
			text:  `<a href="mailto:a@abc.com" rel="noopener noreferrer">b@abc.com</a><br/><p>more text</p>`,
			links: []string{"mailto:a@abc.com", "mailto:b@abc.com"},
		},
		{
			name:  "email in &lt;...&gt;",
			text:  `<div>01.02.2023, 10:11, "Firstname Lastname" &lt;a@abc.com&gt;:</div>`,
			links: []string{"mailto:a@abc.com"},
		},
	}

	for i, test := range tests {