~handlerug/handlebot

ref: 7a6c8b4a09a72ba2fb7880695c2a7211911c68fd handlebot/urlpreview/generic.go -rw-r--r-- 4.4 KiB
7a6c8b4aUmar Getagazov urlpreview: User agent overrides a month ago
                                                                                
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
package urlpreview

import (
	"bufio"
	"bytes"
	"context"
	"errors"
	"fmt"
	"io"
	"mime"
	"net/http"
	"net/url"
	"strconv"
	"strings"

	"github.com/dustin/go-humanize"
	"golang.org/x/net/html"
	"golang.org/x/text/encoding/htmlindex"
)

// generic builds a preview string for an arbitrary URL by issuing a GET
// request for it.
//
// If the response's Content-Type parses, is neither text/html nor
// application/xhtml+xml, and the status is not 405 Method Not Allowed, the
// preview describes the resource itself: just the media type when
// Content-Length is absent or unparsable, otherwise the media type and a
// human-readable size, prefixed by the Content-Disposition filename when one
// is present.
//
// Otherwise the status must be 200 OK (ErrBadResponse is returned if not) and
// the preview is the whitespace-trimmed <title> text found within the first
// 400 KiB of the body.
func (p *Previewer) generic(ctx context.Context, u *url.URL) (string, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, u.String(), nil)
	if err != nil {
		// A nil request would panic further down, so surface the failure.
		return "", fmt.Errorf("building request for %q: %w", u.String(), err)
	}
	// Indexing a nil map is safe and yields "", so no separate nil check.
	if ua := p.UAOverrides[u.Host]; ua != "" {
		req.Header.Set("User-Agent", ua)
	}
	resp, err := p.httpClient().Do(req)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	contentType := resp.Header.Get("Content-Type")
	mediatype, _, err := mime.ParseMediaType(contentType)
	// NOTE(review): this branch runs before the status check below, so a
	// non-HTML error response (e.g. a JSON 404) is still described by its media
	// type instead of yielding ErrBadResponse — confirm that this is intended.
	if err == nil && resp.StatusCode != http.StatusMethodNotAllowed &&
		mediatype != "text/html" && mediatype != "application/xhtml+xml" {
		length, err := strconv.ParseUint(resp.Header.Get("Content-Length"), 10, 64)
		if err != nil {
			// Size unknown: the media type is all we can report.
			return mediatype, nil
		}
		_, params, err := mime.ParseMediaType(resp.Header.Get("Content-Disposition"))
		if err == nil && params["filename"] != "" {
			return fmt.Sprintf("%s (%s, %s)", params["filename"],
				mediatype, humanize.IBytes(length)), nil
		}
		return fmt.Sprintf("%s (%s)", mediatype, humanize.IBytes(length)), nil
	}

	if resp.StatusCode != http.StatusOK {
		return "", ErrBadResponse
	}

	// Never read more than 400 KiB of the page while looking for the title.
	lr := io.LimitedReader{R: resp.Body, N: 400 * 1024}
	var body io.Reader = bufio.NewReader(&lr)

	body, err = decodeHTML(body, contentType)
	if err != nil {
		return "", err
	}

	title, err := findHTMLTitle(body)
	if err != nil {
		return "", err
	}

	// XXX: Invidious instance administrators can probably change this title
	// suffix, so the detection is not reliable. It is also somewhat dangerous
	// because it may lead to recursion through p.Preview, and it costs a second
	// request (a negligible inefficiency).
	if strings.HasSuffix(title, "- Invidious") {
		ytURL := *u
		ytURL.Host = "www.youtube.com"
		if res, err := p.Preview(ctx, &ytURL); res != "" && err == nil {
			return res, nil
		}
	}

	return strings.TrimSpace(title), nil
}

// parseContentType extracts the charset parameter from a Content-Type style
// value. It returns the empty string when the value cannot be parsed as a
// media type or carries no charset parameter.
func parseContentType(contentType string) string {
	_, params, err := mime.ParseMediaType(contentType)
	if err != nil {
		return ""
	}
	// A missing key yields the zero value, i.e. "".
	return params["charset"]
}

// decodeHTML returns a reader over body that, where necessary, decodes the
// document's character encoding.
//
// The charset is chosen from, in order: the charset parameter of contentType,
// a <meta> charset declaration found in the first 1024 bytes of body, and
// finally "utf-8" as the default. An error is returned when htmlindex does not
// recognize the chosen charset name. When the resolved encoding is named
// "utf-8" no decoding layer is added.
func decodeHTML(body io.Reader, contentType string) (io.Reader, error) {
	// Try to find a charset in the Content-Type header
	charset := parseContentType(contentType)

	// Try to find <meta charset=""> tag in the first 1024 bytes
	if charset == "" {
		r := bufio.NewReader(body)
		// Continue reading through r from here on. Peek pulls bytes out of the
		// underlying reader into r's buffer, so reading from the original body
		// afterwards would skip them unless body happened to be this very
		// *bufio.Reader (bufio.NewReader returns its argument unchanged when it
		// is already a large enough *bufio.Reader).
		body = r
		if data, err := r.Peek(1024); err == nil || errors.Is(err, io.EOF) {
			if name, err := findMetaCharset(bytes.NewReader(data)); err == nil {
				charset = name
			}
		}
	}

	// If none of the methods above succeeded, assume UTF-8
	if charset == "" {
		charset = "utf-8"
	}

	enc, err := htmlindex.Get(charset)
	if err != nil {
		// Unknown charset
		return nil, err
	}
	if name, _ := htmlindex.Name(enc); name != "utf-8" {
		body = enc.NewDecoder().Reader(body)
	}
	return body, nil
}

// findMetaCharset scans the HTML in body for a charset declaration in a <meta>
// tag and returns it.
//
// Two forms are recognized: a charset attribute (<meta charset="...">), whose
// value is returned as soon as it is seen, and an http-equiv attribute equal
// to "content-type" (case-insensitively), in which case the charset parameter
// of the tag's content attribute is returned (possibly empty).
//
// Reaching the end of the input without a match is not an error: the result
// is "" with a nil error. Any other tokenizer error is returned as is.
func findMetaCharset(body io.Reader) (charset string, err error) {
	z := html.NewTokenizer(body)
	for {
		switch z.Next() {
		case html.ErrorToken:
			// io.EOF only means the input ran out; report anything else.
			if !errors.Is(z.Err(), io.EOF) {
				return "", z.Err()
			}
			return "", nil
		case html.StartTagToken, html.SelfClosingTagToken:
			name, hasAttr := z.TagName()
			if string(name) != "meta" || !hasAttr {
				continue
			}
			var httpEquiv, content string
			// hasAttr guarantees at least one attribute, so TagAttr is always
			// called once before moreAttr is consulted.
			for moreAttr := true; moreAttr; {
				var key, val []byte
				key, val, moreAttr = z.TagAttr()
				switch string(key) {
				case "charset":
					return string(val), nil
				case "http-equiv":
					httpEquiv = string(val)
				case "content":
					content = string(val)
				}
			}
			if strings.EqualFold(httpEquiv, "content-type") {
				return parseContentType(content), nil
			}
		}
	}
}

// findHTMLTitle tokenizes the HTML in body and returns the text collected
// after a <title> start tag, stopping at the first </title> end tag.
//
// If the input ends before a </title> is seen, whatever text was collected so
// far is returned with a nil error. Any tokenizer error other than io.EOF is
// returned together with the text collected up to that point.
func findHTMLTitle(body io.Reader) (title string, err error) {
	var (
		text    strings.Builder
		inTitle bool
	)
	z := html.NewTokenizer(body)
	for {
		switch tok := z.Next(); tok {
		case html.TextToken:
			if inTitle {
				text.Write(z.Text())
			}
		case html.StartTagToken, html.EndTagToken:
			if tag, _ := z.TagName(); string(tag) != "title" {
				continue
			}
			if tok == html.EndTagToken {
				// </title>: the title is complete.
				return text.String(), nil
			}
			// <title>: start collecting text tokens.
			inTitle = true
		case html.ErrorToken:
			if e := z.Err(); !errors.Is(e, io.EOF) {
				return text.String(), e
			}
			return text.String(), nil
		}
	}
}