~hokiegeek/seculardb

ref: 35780c9c8c7feb23a4cfcdd65ad3e93617fe4546 seculardb/scraper.go -rw-r--r-- 3.5 KiB
35780c9cHokieGeek Added link, made the HTML page look nicer 2 years ago
                                                                                
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
package seculardb

import (
	"bytes"
	"fmt"
	"strings"
	"sync"

	"golang.org/x/net/html"
	"golang.org/x/net/html/atom"

	"git.sr.ht/~hokiegeek/htmlscrape"
)

// GuideURL is the URL of the WP page that hosts the Secular Homeschool Guide.
// Build fetches this page and reads the guide entries from its table.
const GuideURL = "https://www.secularhomeschooler.com/secular-homeschool-guide/"

// Build scrapes the guide table (id "tablepress-1") at GuideURL and turns every
// table row into an Entry of the returned DB.
//
// Rows are handed to a fixed pool of worker goroutines through a channel, so
// the order of db.Entries is not guaranteed to match the order of the table.
//
// A non-nil error is returned when htmlscrape.TableRows fails or when a row
// carries a rating text that is not recognized. In both cases db.Entries is an
// empty, non-nil slice.
func Build() (db DB, err error) {
	db.Entries = make([]Entry, 0)

	// cell returns the <td> below n whose class attribute equals name, or nil
	// when FindNode finds no such cell.
	cell := func(n *html.Node, name string) *html.Node {
		return htmlscrape.FindNode(n, htmlscrape.NewNodeMatcher().Type(html.ElementNode).Atom(atom.Td).Attr("class", name))
	}

	// text flattens the cell with class name into a string. A missing cell
	// yields "".
	text := func(n *html.Node, name string) string {
		td := cell(n, name)
		if td == nil {
			return ""
		}

		var buf bytes.Buffer
		for c := td.FirstChild; c != nil; c = c.NextSibling {
			decorated := (c.Type == html.TextNode && strings.Contains(c.Data, "wp-content")) ||
				(c.Type == html.ElementNode && c.DataAtom == atom.Center)

			switch {
			case decorated:
				// Prefer the text of the first <b> found anywhere in the cell;
				// otherwise fall back to the first child of c itself.
				src := htmlscrape.FindNode(td, htmlscrape.NewNodeMatcher().Type(html.ElementNode).Atom(atom.B))
				if src == nil {
					src = c
				}
				// A "wp-content" text node or an empty element has no child to
				// read from; it contributes nothing instead of dereferencing nil.
				if src.FirstChild != nil {
					buf.WriteString(src.FirstChild.Data)
				}
			case c.Type == html.TextNode:
				buf.WriteString(c.Data)
			}
		}

		return buf.String()
	}

	// rate maps a lower-cased, trimmed rating text to its numeric rating. The
	// boolean is false when the text matches none of the known phrasings.
	rate := func(s string) (int, bool) {
		switch {
		case s == "", s == "n/a",
			strings.Contains(s, "unconfirmed"),
			strings.Contains(s, "not confirmed"):
			return 0, true
		// Must be tested before the generic "not secular" case: this phrase
		// contains "not secular" and would otherwise never be reached.
		case strings.Contains(s, "the \"bible\" version is obviously not secular."):
			return 2, true
		case strings.Contains(s, "not secular"):
			return 1, true
		case strings.Contains(s, "can be secular."),
			strings.Contains(s, "neutral"),
			strings.Contains(s, "questionable"),
			strings.Contains(s, "okay, so the book itself is technically secular"):
			return 2, true
		case strings.Contains(s, "mostly secular"),
			strings.Contains(s, "sorta secular?"):
			return 3, true
		case s == "secular":
			return 4, true
		case strings.Contains(s, "super secular!"):
			return 5, true
		}
		return 0, false
	}

	// parse converts one table row into an Entry.
	parse := func(tr *html.Node) (Entry, error) {
		name := text(tr, "column-1")

		ratingStr := strings.TrimSpace(strings.ToLower(text(tr, "column-2")))
		rating, ok := rate(ratingStr)
		if !ok {
			return Entry{}, fmt.Errorf("entry %q: unrecognized rating %q", name, ratingStr)
		}

		// The link is the href of the first <a> inside the description cell.
		var link string
		if descTD := cell(tr, "column-5"); descTD != nil {
			if a := htmlscrape.FindNode(descTD, htmlscrape.NewNodeMatcher().Atom(atom.A)); a != nil {
				for _, attr := range a.Attr {
					if attr.Key == "href" {
						link = attr.Val
						if !strings.HasPrefix(link, "http") {
							link = "http://" + link
						}
					}
				}
			}
		}

		// Grade levels and subjects are comma separated lists; all spaces are
		// removed before splitting.
		return Entry{
			Name:        name,
			Rating:      Rating(rating),
			GradeLevels: strings.Split(strings.Replace(text(tr, "column-3"), " ", "", -1), ","),
			Subjects:    strings.Split(strings.Replace(text(tr, "column-4"), " ", "", -1), ","),
			Description: text(tr, "column-5"),
			URL:         link,
		}, nil
	}

	const workers = 20

	var (
		mu       sync.Mutex // guards entries and firstErr
		wg       sync.WaitGroup
		entries  = make([]Entry, 0)
		firstErr error
	)

	rows := make(chan *html.Node, 5)
	for w := 0; w < workers; w++ {
		wg.Add(1)
		go func() {
			defer wg.Done()

			// Keep draining rows even after a failure so the producer never
			// blocks on a channel nobody reads from.
			for tr := range rows {
				entry, perr := parse(tr)

				mu.Lock()
				switch {
				case perr == nil:
					entries = append(entries, entry)
				case firstErr == nil:
					firstErr = perr
				}
				mu.Unlock()
			}
		}()
	}

	if err = htmlscrape.TableRows(GuideURL, rows, htmlscrape.NewNodeMatcher().Attr("id", "tablepress-1")); err != nil {
		// The workers only ever write to the local entries slice, so returning
		// db here does not race with them.
		// NOTE(review): the workers are not awaited on this path, as before;
		// they exit only if TableRows closes rows on failure — confirm against
		// htmlscrape.
		return db, err
	}

	// Presumably TableRows has closed rows by now, letting the workers finish —
	// the original code relied on the same assumption.
	wg.Wait()

	if firstErr != nil {
		return db, firstErr
	}

	db.Entries = entries
	return db, nil
}