~djl/rollinss

48b1fb940f8e80c5120e516c89e27a4581689b13 — David Logie 3 years ago bb76a97
Partially parse the JSON so we can grab the MP3 URL.

Recently the format of the MP3 URLs has changed so now we're forced to
grab the URL from the JSON.
3 files changed, 51 insertions(+), 55 deletions(-)

M go.mod
M go.sum
M main.go
M go.mod => go.mod +1 -0
@@ 5,4 5,5 @@ go 1.16
require (
	github.com/PuerkitoBio/goquery v1.6.1 // indirect
	github.com/jbub/podcasts v0.1.0 // indirect
	github.com/tidwall/gjson v1.7.5 // indirect
)

M go.sum => go.sum +6 -0
@@ 4,6 4,12 @@ github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5z
github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
github.com/jbub/podcasts v0.1.0 h1:HctUllH79LIRo0Rvr6hkSK4fjsSKJ+AyVCQCYvjAa8U=
github.com/jbub/podcasts v0.1.0/go.mod h1:On2jleoJGKlW0ZDGgfXKtL5/+3uqZqDiT1k/ntmu/nA=
github.com/tidwall/gjson v1.7.5 h1:zmAN/xmX7OtpAkv4Ovfso60r/BiCi5IErCDYGNJu+uc=
github.com/tidwall/gjson v1.7.5/go.mod h1:5/xDoumyyDNerp2U36lyolv46b3uF/9Bu6OfyQ9GImk=
github.com/tidwall/match v1.0.3 h1:FQUVvBImDutD8wJLN6c5eMzWtjgONK9MwIBCOrUJKeE=
github.com/tidwall/match v1.0.3/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM=
github.com/tidwall/pretty v1.1.0 h1:K3hMW5epkdAVwibsQEfR/7Zj0Qgt4DxtNumTq/VloO8=
github.com/tidwall/pretty v1.1.0/go.mod h1:XNkn88O1ChpSDQmQeStsy+sBenx6DDtFZJxhVysOjyk=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
golang.org/x/net v0.0.0-20200202094626-16171245cfb2 h1:CCH4IOTTfewWjGOlSp+zGcjutRKlBEZQ6wTn8ozI/nI=

M main.go => main.go +44 -55
@@ 1,27 1,26 @@
package main

import (
	"crypto/sha1"
	"errors"
	"flag"
	"fmt"
	"io"
	"io/ioutil"
	"log"
	"net/http"
	"os"
	"strconv"
	"strings"
	"time"

	"github.com/PuerkitoBio/goquery"
	"github.com/jbub/podcasts"
	"github.com/tidwall/gjson"
)

// Program name and version string.
// NOTE(review): version is declared twice below (1.0.0 and 1.0.1); this looks
// like both sides of a diff hunk shown together - confirm which value is current.
const progname = "rollinss"
const version = "1.0.0"
const version = "1.0.1"

// endpoint is the URL of the Henry Rollins show page on kcrw.com.
const endpoint = "https://www.kcrw.com/music/shows/henry-rollins"

// mp3link is a format string for an episode's MP3 URL: %d takes the episode
// number and %s takes the publication date formatted with the layout "060102"
// (see getMP3URL).
const mp3link = "https://od-media.kcrw.com/kcrw/audio/website/music/hr/KCRW-henry_rollins-kcrw_broadcast_%d-%s.mp3"

type Episode struct {
	Title    string


@@ 32,89 31,79 @@ type Episode struct {
	Duration time.Duration
}

// genUUID returns a stable identifier for s: the lowercase hexadecimal
// encoding of its SHA-1 digest (40 characters). Despite the name this is not
// a real UUID; the real one would have to be read from the JSON, and a SHA-1
// digest is good enough here.
func genUUID(s string) string {
	digest := sha1.Sum([]byte(s))
	return fmt.Sprintf("%x", digest[:])
}
// Fetch given URL
func get(url string) (string, error) {
	res, err := http.Get(url)
	if err != nil {
		return "", err
	}
	defer res.Body.Close()

	if res.StatusCode != 200 {
		err = errors.New(fmt.Sprintf("status code error: %d %s", res.StatusCode, res.Status))
		return "", err
	}

// Take a string like "2h, 2min" and return a time.Duration
func getDuration(s string) time.Duration {
	s = strings.ReplaceAll(s, " ", "")
	s = strings.ReplaceAll(s, ",", "")
	s = strings.ReplaceAll(s, "hr", "h")
	s = strings.ReplaceAll(s, "min", "m")
	duration, err := time.ParseDuration(s)
	body, err := ioutil.ReadAll(res.Body)
	if err != nil {
		duration = time.Second * 120
		return "", err
	}
	return duration
}

// Given an episode number and time.Time, return a URL to
// the MP3 file for that episode.
func getMP3URL(epnum int, pubdate time.Time) string {
	return fmt.Sprintf(mp3link, epnum, pubdate.Format("060102"))
	return string(body), nil
}

// Get the episodes from the endpoint
// Errors will likely be either HTTP errors or HTML parsing errors
// (e.g. the HTML changed and this needs to be rewritten accordingly)
func fetchEpisodes(url string) ([]Episode, error) {
	res, err := http.Get(url)
	res, err := get(url)
	if err != nil {
		return nil, err
	}
	defer res.Body.Close()

	if res.StatusCode != 200 {
		err = errors.New(fmt.Sprintf("status code error: %d %s", res.StatusCode, res.Status))
		return nil, err
	}

	doc, err := goquery.NewDocumentFromReader(res.Body)
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(res))
	if err != nil {
		return nil, err
	}

	var episodes []Episode

	doc.Find("div#episodes div.four-col div.single").Each(func(i int, s *goquery.Selection) {
		link, _ := s.Find("a.title-link").Attr("href")
		title := s.Find("h3").Text()
		parts := strings.Split(title, " ")
		epnumStr := parts[len(parts)-1]
	doc.Find("div.four-col.hub-row.no-border button.audio").Each(func(i int, s *goquery.Selection) {
		jurl, exists := s.Attr("data-player-json")
		if !exists {
			return
		}

		// No episode number or no pub date means we can't generate
		// the link to the MP3, so just return here
		epnum, err := strconv.Atoi(epnumStr)
		res, err := get(jurl)
		if err != nil {
			return
		}

		var pubdate time.Time
		datestr, exists := s.Find("time.pubdate").Attr("datetime")
		if exists {
			parsed, err := time.Parse("2006-01-02T15:04:05Z", datestr)
			if err != nil {
				return
			}
			// The time.pubdate HTML is always off by one day for some reason
			// so we need to subtract one day
			pubdate = parsed.AddDate(0, 0, -1)
		json := string(res)
		id := gjson.Get(json, "uuid").String()
		link := gjson.Get(json, "url").String()
		title := gjson.Get(json, "title").String()
		mp3 := gjson.Get(json, "media.0.url").String()

		durstr := gjson.Get(json, "duration").Int()
		duration, err := time.ParseDuration(fmt.Sprintf("%ds", durstr))
		if err != nil {
			return
		}

		duration := getDuration(s.Find("span.duration").Text())
		mp3 := getMP3URL(epnum, pubdate)
		var pubdate time.Time
		datestr := gjson.Get(json, "date").String()
		parsed, err := time.Parse("2006-01-02T15:04:05Z", datestr)
		if err != nil {
			return
		}
		pubdate = parsed.AddDate(0, 0, -1)

		episode := Episode{
			Title:    title,
			Link:     link,
			MP3:      mp3,
			UUID:     genUUID(fmt.Sprintf("%s-%s", title, mp3)),
			UUID:     id,
			PubDate:  pubdate,
			Duration: duration,
		}