~elektito/gemplex

5ef072f1dfd92c2e76918a0d1959b22f6cf7f995 — Mostafa Razavi 1 year, 7 months ago 60192b2
Extract and store ASCII art
3 files changed, 182 insertions(+), 11 deletions(-)

M cmd/gemplex/crawl.go
M cmd/gpctl/main.go
M pkg/gparse/gparse.go
M cmd/gemplex/crawl.go => cmd/gemplex/crawl.go +13 -0
@@ 287,6 287,19 @@ func updateDbSuccessfulVisit(r VisitResult) {
		panic(err)
	}

	for _, img := range r.page.Images {
		imgHash := calcContentHash([]byte(img.Value))
		fmt.Println("Insert:", imgHash)
		fmt.Println(img.Value)
		_, err = tx.Exec(`
insert into images (image_hash, image, alt, content_hash, url, fetch_time)
values ($1, $2, $3, $4, $5, $6)
on conflict (image_hash)
do nothing
`, imgHash, img.Value, img.AltText, contentHash, r.url, r.visitTime)
		utils.PanicOnErr(err)
	}

	var urlId int64
	err = tx.QueryRow(
		`update urls set

M cmd/gpctl/main.go => cmd/gpctl/main.go +83 -0
@@ 2,7 2,9 @@ package main

import (
	"context"
	"crypto/md5"
	"database/sql"
	"encoding/hex"
	"flag"
	"fmt"
	"net/url"


@@ 11,6 13,7 @@ import (
	"path"
	"strings"
	"syscall"
	"time"

	"git.sr.ht/~elektito/gemplex/pkg/config"
	"git.sr.ht/~elektito/gemplex/pkg/db"


@@ 54,6 57,11 @@ func init() {
			ShortUsage: "",
			Handler:    handlePageRankCommand,
		},
		"reimg": {
			Info:       "Update images (ascii art) table.",
			ShortUsage: "",
			Handler:    handleReImgCommand,
		},
		"reparse": {
			Info:       "Re-parse all pages in db, re-calculate columns we get from parsing, and write the results back to db.",
			ShortUsage: "",


@@ 411,6 419,81 @@ func handleUrlInfoCommand(cfg *config.Config, args []string) {
	}
}

// handleReImgCommand implements the "reimg" sub-command. It reads every
// content row that is referenced by a url, re-parses it with gparse.ParsePage,
// and inserts each ASCII-art image found into the images table. Images whose
// insert conflicts with an existing row are left untouched.
//
// Pages that fail to parse are skipped. Any database error, or a stored url
// that cannot be parsed, panics via utils.PanicOnErr. args is unused.
func handleReImgCommand(cfg *config.Config, args []string) {
	// Named conn rather than db so the imported db package is not shadowed.
	conn, err := sql.Open("postgres", cfg.GetDbConnStr())
	utils.PanicOnErr(err)
	defer conn.Close()

	rows, err := conn.Query(`
select c.id, c.content, c.hash, c.content_type, c.fetch_time, u.url
from contents c
join urls u on u.content_id=c.id
`)
	utils.PanicOnErr(err)
	defer rows.Close()

	type Record struct {
		image       string
		alt         string
		fetchTime   time.Time
		url         string
		contentHash string
	}
	var records []Record

	i := 0
	for rows.Next() {
		var cid int64
		var content []byte
		var fetchTime time.Time
		var ustr string
		var ct string
		var contentHash string
		err = rows.Scan(&cid, &content, &contentHash, &ct, &fetchTime, &ustr)
		utils.PanicOnErr(err)

		u, err := url.Parse(ustr)
		utils.PanicOnErr(err)

		page, err := gparse.ParsePage(content, u, ct)
		if err != nil {
			// Unparseable content simply contributes no images.
			continue
		}

		for _, img := range page.Images {
			records = append(records, Record{
				image:       img.Value,
				alt:         img.AltText,
				url:         ustr,
				fetchTime:   fetchTime,
				contentHash: contentHash,
			})
		}

		i++
		if i%1000 == 0 {
			fmt.Println("Progress:", i)
		}
	}

	// rows.Next returns false both at the end of the result set and when the
	// read fails part-way through; without this check a truncated read would
	// be silently treated as a complete one.
	utils.PanicOnErr(rows.Err())

	fmt.Printf("Writing %d image(s) to database...\n", len(records))
	for _, rec := range records {
		// The image hash is the hex-encoded MD5 of the image text.
		hashBytes := md5.Sum([]byte(rec.image))
		imgHash := hex.EncodeToString(hashBytes[:])
		_, err := conn.Exec(`
insert into images (image_hash, content_hash, image, alt, fetch_time, url)
values ($1, $2, $3, $4, $5, $6)
on conflict
do nothing
`, imgHash, rec.contentHash, rec.image, rec.alt, rec.fetchTime, rec.url)
		utils.PanicOnErr(err)
	}

	fmt.Println("Done.")
}

func handleReparseCommand(cfg *config.Config, args []string) {
	// this sub-command re-parses all the contents in the database, checks whether
	// the title has changed, and if so, saves the new title to the database again.

M pkg/gparse/gparse.go => pkg/gparse/gparse.go +86 -11
@@ 19,7 19,9 @@ import (
)

// Parsing thresholds.
//
// minAsciiArtSize and minAsciiArtLines gate ASCII-art extraction: a
// preformatted block is only considered as an image when its accumulated text
// is at least minAsciiArtSize bytes long and it contains at least
// minAsciiArtLines lines.
//
// maxTitleLength is presumably the cap applied to extracted page titles; its
// use is not visible in this excerpt — TODO confirm.
const (
	maxTitleLength = 72
	maxTitleLength   = 72
	minAsciiArtSize  = 64
	minAsciiArtLines = 3
)

type Link struct {


@@ 32,6 34,11 @@ type Heading struct {
	Text  string
}

// Image is an ASCII-art "image" extracted from a preformatted block of a
// gemtext page.
//
// AltText is the text captured from the line that opened the preformatted
// block (it may be empty). Value is the block's content with ANSI escape
// sequences removed and leading/trailing CR and LF characters trimmed.
type Image struct {
	AltText string
	Value   string
}

type Page struct {
	Text     string
	Links    []Link


@@ 39,6 46,7 @@ type Page struct {
	Title    string
	Lang     string
	Kind     string
	Images   []Image
}

var (


@@ 51,6 59,7 @@ var (
	newlineSeqRe     = regexp.MustCompile(`(?m)\n{2,}`)
	allWhitespaceRe  = regexp.MustCompile(`^\s+$`)
	ansiSeqRe        = regexp.MustCompile("[\u001B\u009B][[\\]()#;?]*(?:(?:(?:[a-zA-Z\\d]*(?:;[a-zA-Z\\d]*)*)?\u0007)|(?:(?:\\d{1,4}(?:;\\d{0,4})*)?[\\dA-PRZcf-ntqry=><~]))") // from: https://github.com/acarl005/stripansi/blob/master/stripansi.go
	gitSummaryRe     = regexp.MustCompile(`\s*[MAD]\s+(.+)\s+\|\s+\d+\s+(\++-+|-+|\++)\s*`)
)

func ParsePlain(text string) (result Page) {


@@ 108,30 117,48 @@ func ParseGemtext(text string, base *url.URL) (result Page) {
	firstLine := ""
	inPre := false
	preText := ""
	preAll := ""
	preLineCount := 0
	altText := ""
	lines := strings.Split(text, "\n")
	for _, line := range lines {
		line = strings.TrimRight(line, " ")

		matches := preRe.FindStringSubmatch(line)
		if len(matches) > 0 {
			if !inPre && matches[1] != "" {
				altText := matches[1]
				s.WriteString(altText + "\n")
			}
			if !inPre {
				altText = matches[1]
				if altText != "" {
					s.WriteString(altText + "\n")
				}

				preAll = ""
				preText = ""
				preLineCount = 0
			} else {
				// we're trying not to index ascii art, but do index normal text
				// in a pre block
				if looksLikeText(preText) {
					s.WriteString(preText)
				}

				// add ascii art "images"
				if len(preAll) >= minAsciiArtSize && preLineCount >= minAsciiArtLines && isAsciiArt(preAll) {
					preAll = ansiSeqRe.ReplaceAllLiteralString(preAll, "")
					preAll = strings.Trim(preAll, "\r\n")
					result.Images = append(result.Images, Image{
						AltText: altText,
						Value:   preAll,
					})
				}
			}
			inPre = !inPre
			continue
		}

		if inPre {
			preLineCount++
			preAll += line + "\n"
			if isMostlyAlphanumeric(line) {
				preText += line + "\n"
			}


@@ 371,19 398,67 @@ func looksLikeText(s string) bool {
	return true
}

// classifyRunes counts the runes of s in three disjoint classes: alphaNum
// (Unicode letters and digits), space (Unicode white space), and rest
// (everything else, e.g. punctuation and symbols). The three counts always add
// up to the number of runes in s.
func classifyRunes(s string) (alphaNum int, space int, rest int) {
	for _, r := range s {
		switch {
		case unicode.IsSpace(r):
			space++
		case unicode.IsLetter(r) || unicode.IsDigit(r):
			alphaNum++
		default:
			rest++
		}
	}

	return alphaNum, space, rest
}

// isMostlyAlphanumeric reports whether more than 60% of the non-space runes in
// s are letters or digits. It returns false for a string that is empty or
// consists only of white space.
func isMostlyAlphanumeric(s string) bool {
	alphaNum, _, rest := classifyRunes(s)

	// Count non-space characters in runes, not bytes: len(s) is a byte length
	// and would inflate the denominator for any multi-byte (non-ASCII) text,
	// wrongly classifying e.g. Cyrillic prose as non-alphanumeric.
	nonSpace := alphaNum + rest
	if nonSpace == 0 {
		return false
	}

	return float64(alphaNum)/float64(nonSpace) > 0.6
}

// Patterns used by isAsciiArt to recognise plain-text tables. They are
// compiled once at package initialisation instead of on every call.
var (
	// asciiArtTableSepRe matches a table separator line made only of '-', '+'
	// and '|' characters (optionally surrounded by white space).
	asciiArtTableSepRe = regexp.MustCompile(`^\s*[\-+|]+\s*$`)
	// asciiArtTableRowRe matches a table row: a line that starts and ends with
	// '|' (optionally surrounded by white space).
	asciiArtTableRowRe = regexp.MustCompile(`^\s*\|.*\|\s*$`)
)

// isAsciiArt reports whether s, the text of a preformatted block, looks like
// ASCII art rather than text, a git change summary, or a table.
//
// It returns false when:
//   - s has no non-space characters;
//   - fewer than 75% of its non-space runes are non-alphanumeric;
//   - any line matches gitSummaryRe (e.g. 'M README.md    | 5 +++--');
//   - it contains at least two table separator lines and one table row.
func isAsciiArt(s string) bool {
	alphaNum, _, rest := classifyRunes(s)

	// Count non-space characters in runes, not bytes: art drawn with
	// multi-byte block or box-drawing characters would otherwise have its
	// denominator inflated by len(s) and be rejected.
	nonSpace := alphaNum + rest
	if nonSpace == 0 {
		return false
	}

	if float64(rest)/float64(nonSpace) < 0.75 {
		return false
	}

	tableSepLines := 0
	tableRowLines := 0
	for _, line := range strings.Split(s, "\n") {
		// exclude git summaries, like 'M README.md    | 5 +++--'
		if gitSummaryRe.MatchString(line) {
			return false
		}

		// also exclude tables
		if asciiArtTableSepRe.MatchString(line) {
			tableSepLines++
		} else if asciiArtTableRowRe.MatchString(line) {
			tableRowLines++
		}

		if tableSepLines >= 2 && tableRowLines >= 1 {
			return false
		}
	}

	return true
}

func convertToString(body []byte, contentType string) (s string, err error) {