package urlpreview
import (
"bufio"
"bytes"
"context"
"errors"
"fmt"
"io"
"mime"
"net/http"
"net/url"
"strconv"
"strings"
"github.com/dustin/go-humanize"
"golang.org/x/net/html"
"golang.org/x/text/encoding/htmlindex"
)
// generic builds a one-line preview for an arbitrary URL. Non-HTML
// resources are described by filename, media type and human-readable size;
// HTML pages are previewed by their <title> contents. Invidious pages are
// redirected to the corresponding YouTube preview when possible.
func (p *Previewer) generic(ctx context.Context, u *url.URL) (string, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, u.String(), nil)
	if err != nil {
		return "", err
	}
	resp, err := p.httpClient().Do(req)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()
	lengthStr := resp.Header.Get("Content-Length")
	mimeStr := resp.Header.Get("Content-Type")
	mediatype, _, err := mime.ParseMediaType(mimeStr)
	if err == nil && resp.StatusCode != http.StatusMethodNotAllowed &&
		mediatype != "text/html" && mediatype != "application/xhtml+xml" {
		// Non-HTML resource: describe the file instead of parsing it.
		length, err := strconv.ParseUint(lengthStr, 10, 64)
		if err != nil {
			// No usable Content-Length; fall back to the media type alone.
			return mediatype, nil
		}
		dHeader := resp.Header.Get("Content-Disposition")
		_, params, err := mime.ParseMediaType(dHeader)
		if err == nil && params["filename"] != "" {
			return fmt.Sprintf("%s (%s, %s)", params["filename"],
				mediatype, humanize.IBytes(length)), nil
		}
		return fmt.Sprintf("%s (%s)", mediatype, humanize.IBytes(length)), nil
	}
	if resp.StatusCode != http.StatusOK {
		return "", ErrBadResponse
	}
	// Only read the first 400 KiB: the <title> sits near the top of the
	// document, and this bounds memory use on huge pages.
	lr := io.LimitedReader{R: resp.Body, N: 400 * 1024}
	var body io.Reader = bufio.NewReader(&lr)
	body, err = decodeHTML(body, resp.Header.Get("Content-Type"))
	if err != nil {
		return "", err
	}
	title, err := findHTMLTitle(body)
	if err != nil {
		return "", err
	}
	// XXX: I'm pretty sure there's some way of changing the suffix for
	// server administrators, so this is in no way reliable. It's kind of
	// dangerous because we may end up in a recursion. It's also two
	// requests, which is a bit inefficient, though negligible.
	if strings.HasSuffix(title, "- Invidious") {
		ytURL := *u
		ytURL.Host = "www.youtube.com"
		if res, err := p.Preview(ctx, &ytURL); res != "" && err == nil {
			return res, err
		}
	}
	return strings.TrimSpace(title), nil
}
// parseContentType extracts the charset parameter from a Content-Type
// header value. It returns "" when the value is malformed or carries no
// charset parameter.
func parseContentType(contentType string) string {
	_, params, err := mime.ParseMediaType(contentType)
	if err != nil {
		return ""
	}
	return params["charset"]
}
// decodeHTML wraps body in a reader that converts the HTML document to
// UTF-8. The charset is taken from the Content-Type header, then from a
// <meta> declaration within the first 1024 bytes, and defaults to UTF-8.
// It returns an error if the declared charset is unknown to htmlindex.
func decodeHTML(body io.Reader, contentType string) (io.Reader, error) {
	// Try to find a charset in the Content-Type header
	charset := parseContentType(contentType)
	// Try to find <meta charset=""> tag in the first 1024 bytes
	if charset == "" {
		r := bufio.NewReader(body)
		// Peek pulls bytes out of body into r's buffer; from here on, r
		// must replace body, otherwise the buffered prefix of the
		// document would be silently dropped.
		body = r
		if data, err := r.Peek(1024); err == nil || err == io.EOF {
			if name, err := findMetaCharset(bytes.NewReader(data)); err == nil {
				charset = name
			}
		}
	}
	// If none of the methods above succeeded, assume UTF-8
	if charset == "" {
		charset = "utf-8"
	}
	enc, err := htmlindex.Get(charset)
	if err != nil {
		// Unknown charset
		return nil, err
	}
	// UTF-8 input needs no transcoding; wrap everything else.
	if name, _ := htmlindex.Name(enc); name != "utf-8" {
		body = enc.NewDecoder().Reader(body)
	}
	return body, nil
}
// findMetaCharset tokenizes body (typically the first bytes of an HTML
// document) and returns the charset declared by a <meta charset="..."> or
// <meta http-equiv="Content-Type" content="..."> tag. charset is "" when
// the stream ends without such a declaration; err is non-nil only for
// tokenizer failures other than io.EOF (same policy as findHTMLTitle).
func findMetaCharset(body io.Reader) (charset string, err error) {
	z := html.NewTokenizer(body)
	for {
		switch z.Next() {
		case html.ErrorToken:
			// Plain EOF means "no charset found", not a failure.
			if !errors.Is(z.Err(), io.EOF) {
				err = z.Err()
			}
			return
		case html.StartTagToken, html.SelfClosingTagToken:
			name, hasAttr := z.TagName()
			if string(name) != "meta" || !hasAttr {
				continue
			}
			var httpEquiv, content string
			for {
				key, val, moreAttr := z.TagAttr()
				switch string(key) {
				case "charset":
					// <meta charset="..."> wins immediately.
					charset = string(val)
					return
				case "http-equiv":
					httpEquiv = string(val)
				case "content":
					content = string(val)
				}
				if !moreAttr {
					break
				}
			}
			// <meta http-equiv="Content-Type" content="...; charset=...">
			if strings.EqualFold(httpEquiv, "content-type") {
				charset = parseContentType(content)
				return
			}
		}
	}
}
// findHTMLTitle scans an HTML token stream and returns the concatenated
// text content of the first <title> element. Reaching io.EOF is treated
// as a normal end of input; only other tokenizer errors are reported.
func findHTMLTitle(body io.Reader) (title string, err error) {
	var buf strings.Builder
	inTitle := false
	tok := html.NewTokenizer(body)
	for {
		switch tok.Next() {
		case html.ErrorToken:
			if tokErr := tok.Err(); !errors.Is(tokErr, io.EOF) {
				err = tokErr
			}
			title = buf.String()
			return
		case html.TextToken:
			// Accumulate text only while inside <title>...</title>.
			if inTitle {
				buf.Write(tok.Text())
			}
		case html.StartTagToken:
			if name, _ := tok.TagName(); string(name) == "title" {
				inTitle = true
			}
		case html.EndTagToken:
			// </title>: return whatever text was collected.
			if name, _ := tok.TagName(); string(name) == "title" {
				title = buf.String()
				return
			}
		}
	}
}