~stick/crwlr

2e7af62639c9f8d9925997eb7d18743a5912ea5b — Stick 8 months ago main
play around with crawling
1 files changed, 60 insertions(+), 0 deletions(-)

A crwlr.go
A  => crwlr.go +60 -0
@@ 1,60 @@
package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
	"os"
	"strings"

	"github.com/anaskhan96/soup"
)

// errOut reports err and terminates the program with exit status 1.
//
// The message goes to standard error so it does not get mixed into the
// crawl results that the program prints on standard output.
func errOut(err error) {
	fmt.Fprintln(os.Stderr, err)
	os.Exit(1)
}

// crawl fetches the page at url and counts the links it contains.
//
// The returned map is keyed by the absolute link target with any fragment
// removed; the value is the number of <a> elements on the page that point at
// that target. mailto: and javascript: links are ignored. Any failure
// (request error, non-200 status, unreadable or unparsable body) yields an
// empty, non-nil map so callers can merge results unconditionally.
func crawl(url string) map[string]int {
	found := map[string]int{}
	resp, err := http.Get(url)
	if err != nil {
		return found
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return found
	}

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		// A truncated document would give misleading counts; treat it like
		// any other failed fetch.
		return found
	}

	doc := soup.HTMLParse(string(body))
	if doc.Error != nil {
		return found
	}

	// Resolve relative links against the URL that was actually fetched
	// (after redirects). Plain string concatenation of url and href breaks
	// for root-relative links on non-root pages and for path-relative links.
	base := resp.Request.URL

	for _, link := range doc.FindAll("a") {
		raw, ok := link.Attrs()["href"]
		if !ok {
			// An anchor without href (e.g. a named anchor) is not a link.
			continue
		}

		// Drop the fragment so "#section" variants count as the same page.
		href := strings.SplitN(strings.TrimSpace(raw), "#", 2)[0]

		target, err := base.Parse(href)
		if err != nil {
			continue
		}
		switch target.Scheme {
		case "mailto", "javascript":
			continue
		}
		found[target.String()]++
	}

	return found
}

// main crawls the hard-coded start page, then every page linked from it
// (exactly one extra level), and prints each URL whose accumulated link
// count is greater than three, followed by that count.
func main() {
	baseurl := "https://stma.is"
	found := crawl(baseurl)

	// Snapshot the first-level URLs before crawling them. Inserting into
	// found while ranging over it leaves it unspecified whether the newly
	// added second-level URLs are visited as well, which made the crawl
	// depth vary from run to run.
	firstLevel := make([]string, 0, len(found))
	for link := range found {
		firstLevel = append(firstLevel, link)
	}

	for _, link := range firstLevel {
		for site, count := range crawl(link) {
			found[site] += count
		}
	}

	for link, count := range found {
		if count > 3 {
			fmt.Println(link, count)
		}
	}
}