~kdsch/android-bulletin

41ce27c36f8fc0f42fb237987bb2f72775651dfa — Karl Schultheisz 3 years ago
first commit
4 files changed, 304 insertions(+), 0 deletions(-)

A go.mod
A go.sum
A main.go
A notes
A  => go.mod +8 -0
@@ 1,8 @@
module android-bulletin-scraper

go 1.16

require (
	github.com/PuerkitoBio/goquery v1.7.0
	golang.org/x/net v0.0.0-20210614182718-04defd469f4e // indirect
)

A  => go.sum +16 -0
@@ 1,16 @@
github.com/PuerkitoBio/goquery v1.7.0 h1:O5SP3b9JWqMSVMG69zMfj577zwkSNpxrFf7ybS74eiw=
github.com/PuerkitoBio/goquery v1.7.0/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc=
github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5zzsLTo=
github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20210614182718-04defd469f4e h1:XpT3nA5TvE525Ne3hInMh6+GETgn27Zfm9dxsThnX2Q=
golang.org/x/net v0.0.0-20210614182718-04defd469f4e/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=

A  => main.go +247 -0
@@ 1,247 @@
package main

import (
	"os"
	"encoding/csv"
	"fmt"
	"io"
	"net/http"
	"strings"
	"sync"

	"github.com/PuerkitoBio/goquery"
)

// baseURL is the scheme and host that is prepended both to the bulletin
// index path and to the href of every bulletin link found on that page.
const baseURL = "https://source.android.com"

// getBulletinURLs downloads the bulletin index page at
// baseURL + "/security/bulletin" and returns one absolute URL (baseURL + href)
// for every "td a" link whose text is exactly "English".
//
// Failures are reported on stdout rather than returned: a transport error, a
// non-200 status, or an unparsable document all yield a nil slice.
func getBulletinURLs() (urls []string) {
	url := baseURL + "/security/bulletin"
	resp, err := http.Get(url)
	if err != nil {
		fmt.Println(err)
		return nil
	}
	// The body has to be closed on every path once the request succeeded,
	// otherwise the underlying connection is leaked.
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		fmt.Printf("%d GET %s\n", resp.StatusCode, url)
		return nil
	}

	doc, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		fmt.Println(err)
		return nil
	}

	doc.Find("td a").Each(func(_ int, s *goquery.Selection) {
		if s.Text() != "English" {
			return
		}
		// Links without an href attribute cannot be followed; skip them.
		if path, ok := s.Attr("href"); ok {
			urls = append(urls, baseURL+path)
		}
	})
	return urls
}

// getBulletin issues a GET for url and returns the response body as a reader.
//
// It returns nil when the request fails or the status is not 200; in both
// cases the problem is printed to stdout. On success the returned reader is
// the live http.Response.Body (an io.ReadCloser), so a caller that wants to
// release the connection can type-assert it to io.Closer and close it after
// reading.
func getBulletin(url string) io.Reader {
	resp, err := http.Get(url)
	if err != nil {
		fmt.Println(err)
		return nil
	}

	if resp.StatusCode != http.StatusOK {
		fmt.Printf("%d GET %s\n", resp.StatusCode, url)
		// Nobody will ever read this body, so close it here instead of
		// leaking the connection.
		resp.Body.Close()
		return nil
	}

	return resp.Body
}

// entry is one vulnerability row scraped from a bulletin table.
type entry struct {
	patchLevel string   // first word of the enclosing h2 heading
	component  string   // text of the enclosing h3 heading
	component2 string   // last table column when that column is "Component"
	cve        string   // first table column
	Type       string   // third table column; capitalised because "type" is a keyword
	severity   string   // fourth table column
	versions   []string // last table column, split on ", ", when it is not "Component"
}

// String renders the entry as a single human-readable sentence of the form
// "<cve>, <severity>-rated <type>, patched <level>[ in <versions>], affects
// <component>[: <component2>]". An empty Type is shown as "Unknown".
func (e entry) String() string {
	// Start from the plain value and extend it only when the optional part
	// is present.
	affected := e.component
	if e.component2 != "" {
		affected += ": " + e.component2
	}

	category := e.Type
	if category == "" {
		category = "Unknown"
	}

	fixedIn := e.patchLevel
	if len(e.versions) > 0 {
		fixedIn += " in " + strings.Join(e.versions, ", ")
	}

	return fmt.Sprintf("%s, %s-rated %s, patched %s, affects %s",
		e.cve, e.severity, category, fixedIn, affected)
}

// getCVEs parses one bulletin page and returns an entry for every data row of
// every five-column table that follows a component heading.
//
// Page layout notes:
//   - patch level heading: h2 span.devsite-heading
//   - component heading: h3 span.devsite-heading
//   - table entry: div.devsite-table-wrapper table tbody tr
//
// Only h2 headings whose text starts with "20" are treated as patch levels;
// the first whitespace-separated word of the heading becomes the patch level.
// A nil reader or an unparsable document yields a nil slice (parse errors are
// printed to stdout).
func getCVEs(bulletin io.Reader) (entries []entry) {
	// A nil reader would otherwise panic inside the HTML parser.
	if bulletin == nil {
		return nil
	}

	doc, err := goquery.NewDocumentFromReader(bulletin)
	if err != nil {
		fmt.Println(err)
		return nil
	}

	doc.Find("h2").Each(func(_ int, h2 *goquery.Selection) {
		// Each patch level begins with a 21st-century year.
		if !strings.HasPrefix(h2.Text(), "20") {
			return
		}

		patchLevel := strings.Fields(h2.Text())[0]
		h2.NextFilteredUntil("h3", "h2").Each(func(_ int, h3 *goquery.Selection) {
			component := strings.TrimSpace(h3.Text())

			// Stop at the next heading of either level so that the last
			// component of a patch level does not absorb tables that belong
			// to the following h2 section.
			colgroup := h3.NextUntil("h2, h3").Find("colgroup")
			if colgroup.Children().Length() != 5 {
				return
			}

			// The last column holds either a sub-component or the affected
			// versions; the header row tells which.
			lastIsComponent := true
			colgroup.Next().Children().Each(func(i int, row *goquery.Selection) {
				if i == 0 {
					header := strings.TrimSpace(row.Children().Last().Text())
					lastIsComponent = header == "Component"
					return
				}

				e := entry{
					patchLevel: patchLevel,
					component:  component,
				}
				row.Children().Each(func(j int, field *goquery.Selection) {
					// Trim every cell, not just the CVE column, so the
					// markup's surrounding whitespace never reaches the output.
					text := strings.TrimSpace(field.Text())

					// cve references type severity versions
					switch j {
					case 0:
						e.cve = text
					case 1:
						// TODO: e.references
					case 2:
						e.Type = text
					case 3:
						e.severity = text
					case 4:
						switch {
						case lastIsComponent:
							e.component2 = text
						case text != "":
							// An empty cell leaves versions nil instead of
							// producing a single empty version string.
							e.versions = strings.Split(text, ", ")
						}
					}
				})
				entries = append(entries, e)
			})
		})
	})
	return entries
}

// getBulletinStream fetches every URL concurrently, one goroutine per URL,
// and returns a channel that delivers the bulletin readers as they become
// available. The channel is closed once every fetch has finished.
//
// URLs whose fetch fails (getBulletin returns nil) are dropped, so the
// channel only ever carries non-nil readers.
func getBulletinStream(urls []string) <-chan io.Reader {
	bulletins := make(chan io.Reader, 20)

	var wg sync.WaitGroup
	wg.Add(len(urls))
	for _, url := range urls {
		go func(url string) {
			// Deferred so the closer goroutine is released even if the
			// fetch panics.
			defer wg.Done()

			// Forwarding a nil reader would crash the HTML parser on the
			// receiving side; skip failed downloads instead.
			if b := getBulletin(url); b != nil {
				bulletins <- b
			}
		}(url)
	}

	// Close the channel only after the last sender is done.
	go func() {
		wg.Wait()
		close(bulletins)
	}()
	return bulletins
}

// getCVEsFromBulletinStream drains bulletins until the channel is closed and
// returns the concatenation of the entries parsed from each reader.
//
// Nil readers are skipped. After a reader has been parsed it is closed if it
// implements io.Closer (HTTP response bodies do), so that its underlying
// resources are released; a close error is printed to stdout.
func getCVEsFromBulletinStream(bulletins <-chan io.Reader) (entries []entry) {
	for b := range bulletins {
		// getBulletin signals a failed download with nil.
		if b == nil {
			continue
		}

		entries = append(entries, getCVEs(b)...)

		if c, ok := b.(io.Closer); ok {
			if err := c.Close(); err != nil {
				fmt.Println(err)
			}
		}
	}
	return entries
}

// writeEntriesToCSV creates (or truncates) filename and writes entries to it
// as tab-separated records preceded by a header row. The versions of an entry
// are joined with single spaces into one field.
//
// Every failure -- creating the file, writing a record, flushing buffered
// data, or closing the file -- is printed to stdout; nothing is returned.
func writeEntriesToCSV(entries []entry, filename string) {
	f, err := os.Create(filename)
	if err != nil {
		fmt.Println(err)
		return
	}

	if err := encodeEntries(f, entries); err != nil {
		fmt.Println(err)
		f.Close() // best-effort cleanup; the encoding error is already reported
		return
	}

	// A failed Close can mean data never reached the disk, so report it.
	if err := f.Close(); err != nil {
		fmt.Println(err)
	}
}

// encodeEntries writes the header row and one tab-separated record per entry
// to dst, and returns the first write or flush error encountered.
func encodeEntries(dst io.Writer, entries []entry) error {
	w := csv.NewWriter(dst)
	w.Comma = '\t'

	headings := []string{
		"patchlevel",
		"component",
		"component2",
		"cve",
		"type",
		"severity",
		"versions",
	}
	if err := w.Write(headings); err != nil {
		return err
	}

	for _, e := range entries {
		fields := []string{
			e.patchLevel,
			e.component,
			e.component2,
			e.cve,
			e.Type,
			e.severity,
			strings.Join(e.versions, " "),
		}
		if err := w.Write(fields); err != nil {
			return err
		}
	}

	// Flush itself returns nothing; the error it hit, if any, is only
	// available through Error.
	w.Flush()
	return w.Error()
}

func main() {
	urls := getBulletinURLs()
	bulletins := getBulletinStream(urls)
	entries := getCVEsFromBulletinStream(bulletins)
	writeEntriesToCSV(entries, "vulns.csv")
}

A  => notes +33 -0
@@ 1,33 @@
Suppose you have an Android device and you want to load it with a
different operating system---like postmarketOS. Most Android
devices come with locked bootloaders, some of which are harder
to unlock than others. Many were never meant to be unlocked by
users. The reason is often that smartphones are sold at a
loss that vendors hope to recoup through data-collecting
services that infringe on user privacy and promote consumption
through targeted advertising. Big tech has to remain in control
of these devices if it is going to recoup its lost revenue.

The movement for user freedom has two ways to cope with this situation.
First is to petition the government to regulate companies to treat
consumers better. Good luck; big tech has a powerful political lobby.
Second is to organize a community around exploiting vulnerabilities in
Android devices to liberate their users from corporate control.

When in possession of a particular Android device, it is critical to
get as much information about it as possible so that its vulnerabilities
can be understood. That is the rationale for compiling a database of
CVEs.


Android security bulletins are published only as HTML. To make them
more useful, we need to convert them into a database over which we can
run queries.

One way to do this is by developing a data model of the bulletins,
downloading the bulletin pages, analyzing them, and writing the data to
a file. Data modeling and analysis are tricky, because the data isn't
in a consistent format.

For file formats, NoSQL, SQLite, and CSV were considered. CSV was chosen
because of simplicity and accessibility to a wide range of users.