~m15o/htmlj

8dda81d963cca6178148e840897fae78c60c5a17 — m15o 1 year, 11 months ago dc8ab7d
Rely only on the HTML parser to manipulate the DOM
1 file changed, 8 insertions(+), 14 deletions(-)

M htmlj.go
M htmlj.go => htmlj.go +8 -14
@@ 51,16 51,6 @@ func Parse(r io.Reader) (*Journal, error) {
	return j, err
}

// hre matches an <h1> or <h2> element together with its contents. The (?s)
// flag lets "." match newlines, so a heading whose text spans several lines
// is matched as well.
var hre = regexp.MustCompile(`(?s)<h[12][^>]*>.*?</h[12]>`)

// are matches an opening <article> tag, including any attributes.
var are = regexp.MustCompile(`<article[^>]*>`)

// extractContent removes the wrapping markup from the HTML fragment s: every
// opening <article> tag, every <h1>/<h2> element, and the first closing
// </article> tag. The remainder is returned with leading and trailing
// whitespace trimmed.
func extractContent(s string) string {
	rv := are.ReplaceAllString(s, "")
	rv = hre.ReplaceAllString(rv, "")
	// Only the first closing tag is dropped; any later </article> is kept.
	rv = strings.Replace(rv, "</article>", "", 1)
	return strings.TrimSpace(rv)
}

func walk(n *html.Node, j *Journal) error {
	if n.Type == html.ElementNode && n.Data == "h1" {
		if len(j.Title) == 0 {


@@ 69,16 59,20 @@ func walk(n *html.Node, j *Journal) error {
	}

	if n.Type == html.ElementNode && (n.Data == "h1" || n.Data == "h2") {
		title := n.FirstChild.Data
		title := strings.TrimSpace(n.FirstChild.Data)
		if re.MatchString(title) {
			p := n.Parent
			p.RemoveChild(n)
			var b bytes.Buffer
			if err := html.Render(&b, n.Parent); err != nil {
				return err
			for c := p.FirstChild; c != nil; c = c.NextSibling {
				if err := html.Render(&b, c); err != nil {
					return err
				}
			}
			j.Entries = append(j.Entries, Entry{
				Title:     title,
				Published: title[0:10],
				Content:   extractContent(b.String()),
				Content:   strings.TrimSpace(b.String()),
			})
			// TODO: remove when all journals have migrated to use h2
			if n.Data == "h1" {