M cmd/gemplex/crawl.go => cmd/gemplex/crawl.go +13 -0
@@ 287,6 287,19 @@ func updateDbSuccessfulVisit(r VisitResult) {
panic(err)
}
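+ // Record any ascii art found on the page in the images table. The hash
+ // of the image text is the key, so the same art appearing on multiple
+ // pages is only stored once.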
+ for _, img := range r.page.Images {
+ imgHash := calcContentHash([]byte(img.Value))
+ _, err = tx.Exec(`
+insert into images (image_hash, image, alt, content_hash, url, fetch_time)
+values ($1, $2, $3, $4, $5, $6)
+on conflict (image_hash)
+do nothing
+`, imgHash, img.Value, img.AltText, contentHash, r.url, r.visitTime)
+ utils.PanicOnErr(err)
+ }
+
var urlId int64
err = tx.QueryRow(
`update urls set
M cmd/gpctl/main.go => cmd/gpctl/main.go +83 -0
@@ 2,7 2,9 @@ package main
import (
"context"
+ "crypto/md5"
"database/sql"
+ "encoding/hex"
"flag"
"fmt"
"net/url"
@@ 11,6 13,7 @@ import (
"path"
"strings"
"syscall"
+ "time"
"git.sr.ht/~elektito/gemplex/pkg/config"
"git.sr.ht/~elektito/gemplex/pkg/db"
@@ 54,6 57,11 @@ func init() {
ShortUsage: "",
Handler: handlePageRankCommand,
},
+ "reimg": {
+ Info: "Re-parse all pages in db and update the images (ascii art) table.",
+ ShortUsage: "",
+ Handler: handleReImgCommand,
+ },
"reparse": {
Info: "Re-parse all pages in db, re-calculate columns we get from parsing, and write the results back to db.",
ShortUsage: "",
@@ 411,6 419,81 @@ func handleUrlInfoCommand(cfg *config.Config, args []string) {
}
}
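+// handleReImgCommand re-parses every page stored in the database and inserts
+// any ascii art it finds into the images table.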
+func handleReImgCommand(cfg *config.Config, args []string) {
+ db, err := sql.Open("postgres", cfg.GetDbConnStr())
+ utils.PanicOnErr(err)
+ defer db.Close()
+
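+ // Read every stored page content together with a url that points to it.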
+ rows, err := db.Query(`
+select c.id, c.content, c.hash, c.content_type, c.fetch_time, u.url
+from contents c
+join urls u on u.content_id=c.id
+`)
+ utils.PanicOnErr(err)
+ defer rows.Close()
+
+ type Record struct {
+ image string
+ alt string
+ fetchTime time.Time
+ url string
+ contentHash string
+ }
+ var records []Record
+
+ i := 0
+ for rows.Next() {
+ var cid int64
+ var content []byte
+ var fetchTime time.Time
+ var ustr string
+ var ct string
+ var contentHash string
+ err = rows.Scan(&cid, &content, &contentHash, &ct, &fetchTime, &ustr)
+ utils.PanicOnErr(err)
+
+ u, err := url.Parse(ustr)
+ utils.PanicOnErr(err)
+
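+ // Re-parse the stored content; pages that no longer parse are skipped.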
+ page, err := gparse.ParsePage(content, u, ct)
+ if err != nil {
+ continue
+ }
+
+ for _, img := range page.Images {
+ // fmt.Println("====", cid, img.AltText, ustr)
+ // fmt.Println(img.Value)
+ records = append(records, Record{
+ image: img.Value,
+ alt: img.AltText,
+ url: ustr,
+ fetchTime: fetchTime,
+ contentHash: contentHash,
+ })
+ }
+
+ i++
+ if i%1000 == 0 {
+ fmt.Println("Progress:", i)
+ }
+ }
+
+ fmt.Printf("Writing %d image(s) to database...\n", len(records))
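+ // The md5 of the image text is used as the image_hash key, so identical
+ // pieces of ascii art are only stored once.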
+ for _, rec := range records {
+ hashBytes := md5.Sum([]byte(rec.image))
+ imgHash := hex.EncodeToString(hashBytes[:])
+ _, err := db.Exec(`
+insert into images (image_hash, content_hash, image, alt, fetch_time, url)
+values ($1, $2, $3, $4, $5, $6)
+on conflict
+do nothing
+`, imgHash, rec.contentHash, rec.image, rec.alt, rec.fetchTime, rec.url)
+ utils.PanicOnErr(err)
+ }
+
+ fmt.Println("Done.")
+}
+
func handleReparseCommand(cfg *config.Config, args []string) {
// this sub-command re-parses all the contents in the database, checks if the
// title has changed, and if so, saves the new titles to the database again.
M pkg/gparse/gparse.go => pkg/gparse/gparse.go +86 -11
@@ 19,7 19,9 @@ import (
)
const (
- maxTitleLength = 72
+ maxTitleLength = 72
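+ // pre blocks smaller than this (in bytes and lines, respectively) are
+ // never treated as ascii art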
+ minAsciiArtSize = 64
+ minAsciiArtLines = 3
)
type Link struct {
@@ 32,6 34,11 @@ type Heading struct {
Text string
}
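+// Image is a piece of ascii art extracted from a pre-formatted block, along
+// with the block's alt text (if any).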
+type Image struct {
+ AltText string
+ Value string
+}
+
type Page struct {
Text string
Links []Link
@@ 39,6 46,7 @@ type Page struct {
Title string
Lang string
Kind string
+ Images []Image
}
var (
@@ 51,6 59,7 @@ var (
newlineSeqRe = regexp.MustCompile(`(?m)\n{2,}`)
allWhitespaceRe = regexp.MustCompile(`^\s+$`)
ansiSeqRe = regexp.MustCompile("[\u001B\u009B][[\\]()#;?]*(?:(?:(?:[a-zA-Z\\d]*(?:;[a-zA-Z\\d]*)*)?\u0007)|(?:(?:\\d{1,4}(?:;\\d{0,4})*)?[\\dA-PRZcf-ntqry=><~]))") // from: https://github.com/acarl005/stripansi/blob/master/stripansi.go
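+ // matches git diff summary lines like "M README.md | 5 +++--"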
+ gitSummaryRe = regexp.MustCompile(`\s*[MAD]\s+(.+)\s+\|\s+\d+\s+(\++-+|-+|\++)\s*`)
)
func ParsePlain(text string) (result Page) {
@@ 108,30 117,48 @@ func ParseGemtext(text string, base *url.URL) (result Page) {
firstLine := ""
inPre := false
preText := ""
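+ // preAll keeps the whole pre block verbatim so it can be checked for
+ // ascii art; preText only keeps the lines that look like normal text.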
+ preAll := ""
+ preLineCount := 0
+ altText := ""
lines := strings.Split(text, "\n")
for _, line := range lines {
line = strings.TrimRight(line, " ")
matches := preRe.FindStringSubmatch(line)
if len(matches) > 0 {
- if !inPre && matches[1] != "" {
- altText := matches[1]
- s.WriteString(altText + "\n")
- }
if !inPre {
+ altText = matches[1]
+ if altText != "" {
+ s.WriteString(altText + "\n")
+ }
+
+ preAll = ""
preText = ""
+ preLineCount = 0
} else {
// we're trying not to index ascii art, but do index normal text
// in a pre block
if looksLikeText(preText) {
s.WriteString(preText)
}
+
+ // add ascii art "images"
+ if len(preAll) >= minAsciiArtSize && preLineCount >= minAsciiArtLines && isAsciiArt(preAll) {
+ preAll = ansiSeqRe.ReplaceAllLiteralString(preAll, "")
+ preAll = strings.Trim(preAll, "\r\n")
+ result.Images = append(result.Images, Image{
+ AltText: altText,
+ Value: preAll,
+ })
+ }
}
inPre = !inPre
continue
}
if inPre {
+ preLineCount++
+ preAll += line + "\n"
if isMostlyAlphanumeric(line) {
preText += line + "\n"
}
@@ 371,19 398,67 @@ func looksLikeText(s string) bool {
return true
}
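+// classifyRunes counts the alphanumeric, whitespace, and remaining runes in s.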
+func classifyRunes(s string) (alphaNum int, space int, rest int) {
+ for _, r := range s {
+ switch {
+ case unicode.IsSpace(r):
+ space++
+ case unicode.IsLetter(r) || unicode.IsDigit(r):
+ alphaNum++
+ default:
+ rest++
+ }
+ }
+
+ return
+}
+
func isMostlyAlphanumeric(s string) bool {
- if s == "" {
+ alphaNum, _, rest := classifyRunes(s)
+ nonSpace := alphaNum + rest
+ if nonSpace == 0 {
return false
}
- n := 0
- for _, r := range s {
- if unicode.IsLetter(r) || unicode.IsDigit(r) {
- n += 1
+ return float64(alphaNum)/float64(nonSpace) > 0.6
+}
+
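+// isAsciiArt reports whether the contents of a pre block look like ascii
+// art: mostly non-alphanumeric characters, and not a git diff summary or a
+// plain-text table.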
+func isAsciiArt(s string) bool {
+ alphaNum, _, rest := classifyRunes(s)
+ nonSpace := alphaNum + rest
+ if nonSpace == 0 {
+ return false
+ }
+
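+ // require that at least 75% of the non-space characters are neither
+ // letters nor digits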
+ if float64(rest)/float64(nonSpace) < 0.75 {
+ return false
+ }
+
+ tableSepRe := regexp.MustCompile(`^\s*[\-+|]+\s*$`)
+ tableRowRe := regexp.MustCompile(`^\s*\|.*\|\s*$`)
+
+ lines := strings.Split(s, "\n")
+ tableSepLines := 0
+ tableRowLines := 0
+ for _, line := range lines {
+ // exclude git summaries, like 'M README.md | 5 +++--'
+ if gitSummaryRe.MatchString(line) {
+ return false
+ }
+
+ // also exclude tables
+ if tableSepRe.MatchString(line) {
+ tableSepLines++
+ } else if tableRowRe.MatchString(line) {
+ tableRowLines++
+ }
+
+ if tableSepLines >= 2 && tableRowLines >= 1 {
+ return false
}
}
- return float64(n)/float64(len(s)) > 0.6
+ return true
}
func convertToString(body []byte, contentType string) (s string, err error) {