~boringcactus/gemifedi

ref: b5f6c154e62c1fc4ea2ec57459bb05fdb8fda4a5 gemifedi/src/html2gemtext.rs -rw-r--r-- 4.7 KiB
b5f6c154 — Melody Horn bump to v0.2.1 1 year, 3 months ago
                                                                                
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
// shout out to https://gitlab.com/Kanedias/html2md

use html5ever::{
    tendril::TendrilSink,
    QualName,
};
use maj::{
    gemini as gemtext,
};
use markup5ever::{namespace_url, ns};
use markup5ever_rcdom::{Handle, NodeData, RcDom};

/// Converts an HTML fragment into a flat list of gemtext nodes.
///
/// The input is parsed as a fragment in the context of a `<div>` element,
/// then the resulting tree is walked depth-first and every run of text is
/// emitted as a node shaped by the innermost enclosing element.
pub fn parse_html(html: &str) -> Vec<gemtext::Node> {
    // The element the fragment is treated as being nested inside of.
    let fragment_root = QualName::new(None, ns!(html), markup5ever::LocalName::from("div"));

    let dom = html5ever::parse_fragment(
        RcDom::default(),
        html5ever::ParseOpts::default(),
        fragment_root,
        vec![],
    )
    .one(html);

    let mut state = ParseState::default();
    walk(&dom.document, &mut state);
    state.finished
}

fn walk(input: &Handle, result: &mut ParseState) {
    let mut new_context = None;
    let mut was_tag = false;
    match input.data {
        NodeData::Document | NodeData::Doctype {..} | NodeData::ProcessingInstruction {..} | NodeData::Comment {..} => {},
        NodeData::Text { ref contents }  => {
            let text = contents.borrow().to_string();
            result.pending += &text;
        }
        NodeData::Element { ref name, ref attrs, .. } => {
            was_tag = true;
            let tag_name = name.local.to_string();
            // no user-supplied factory, take one of built-in ones
            new_context = match tag_name.as_ref() {
                "p" => Some(Context::Paragraph),
                "br" => {
                    result.finish();
                    Some(Context::Paragraph)
                }
                "blockquote" => Some(Context::Quote),
                "h1" => Some(Context::Header(1)),
                "h2" => Some(Context::Header(2)),
                "h3" => Some(Context::Header(3)),
                "h4" => Some(Context::Header(4)),
                "h5" => Some(Context::Header(5)),
                "h6" => Some(Context::Header(6)),
                "pre" => Some(Context::Preformatted),
                "a" => {
                    result.finish();
                    Some(Context::Link(attrs.borrow().iter().find_map(|attribute| {
                        if attribute.name.local == *"href" {
                            Some(attribute.value.to_string())
                        } else {
                            None
                        }
                    }).unwrap_or("".to_string())))
                },
                "li" => Some(Context::ListItem),
                "span" => Some(Context::FlattenedOut),
                "html" => Some(Context::Paragraph),
                tag => {
                    log::info!("unknown tag <{}>", tag);
                    Some(Context::FlattenedOut)
                }
            };
            if let Some(new_context) = &new_context {
                result.context.push(new_context.clone());
            }
        }
    }

    for child in input.children.borrow().iter() {
        use std::borrow::Borrow;
        walk(child.borrow(), result);
    }

    if was_tag {
        result.finish();
    }
    if new_context.take().is_some() {
        result.context.pop();
    }
}

/// Mutable state threaded through `walk` while an HTML tree is converted to gemtext.
#[derive(Default)]
struct ParseState {
    /// Nodes emitted so far, in output order.
    finished: Vec<gemtext::Node>,
    /// Text gathered from text nodes that has not been turned into a node yet.
    pending: String,
    /// Stack of contexts for the elements currently being walked; the innermost is last.
    context: Vec<Context>,
}

impl ParseState {
    /// Turns the pending text into a finished node shaped by the innermost context.
    ///
    /// Does nothing when there is no pending text, or when the innermost
    /// context is `Context::FlattenedOut` — in that case the text stays
    /// pending so that an enclosing element can claim it later. With an empty
    /// context stack the text is emitted as a plain paragraph.
    fn finish(&mut self) {
        if self.pending.is_empty() {
            return;
        }

        let innermost = match self.context.last() {
            Some(Context::FlattenedOut) => return,
            Some(context) => context,
            None => &Context::Paragraph,
        };

        let text = std::mem::take(&mut self.pending);
        self.finished.push(innermost.apply(text));
    }
}

/// The kind of gemtext line that text inside an HTML element should become.
///
/// `Debug` and `Eq` are derived in addition to `Clone`/`PartialEq` so that
/// values can be logged and used with `assert_eq!`.
#[derive(Clone, Debug, PartialEq, Eq)]
enum Context {
    /// Plain text line (`<p>`, `<br>`, and the root `<html>` element).
    Paragraph,
    /// Quote line (`<blockquote>`).
    Quote,
    /// Heading line; the payload is the level taken from `<h1>`..`<h6>` (1 to 6).
    Header(u8),
    /// Preformatted text (`<pre>`).
    Preformatted,
    /// Link line (`<a>`); the payload is the `href` value, empty if the attribute is absent.
    Link(String),
    /// List item line (`<li>`).
    ListItem,
    /// Element with no output of its own (`<span>` and unknown tags): its text
    /// is left pending and ends up in the enclosing element's node.
    FlattenedOut,
}

impl Context {
    /// Wraps `contents` in the gemtext node that corresponds to this context.
    ///
    /// `FlattenedOut` has no node of its own; `ParseState::finish` never
    /// calls this while it is the innermost context, so that arm only
    /// produces a visible marker in case it is ever reached.
    fn apply(&self, contents: String) -> gemtext::Node {
        match self {
            Self::Paragraph => gemtext::Node::Text(contents),
            Self::Quote => gemtext::Node::Quote(contents),
            Self::Header(level) => gemtext::Node::Heading {
                level: *level,
                body: contents,
            },
            Self::Preformatted => gemtext::Node::Preformatted(contents),
            Self::Link(href) => gemtext::Node::Link {
                to: href.to_owned(),
                name: Some(contents),
            },
            Self::ListItem => gemtext::Node::ListItem(contents),
            Self::FlattenedOut => {
                gemtext::Node::Preformatted(String::from("this should not happen!"))
            }
        }
    }
}

#[cfg(test)]
mod test {
    use super::*;

    /// Regression test: a link wrapped in (and containing) `<span>`s must come
    /// out as a single link line, followed by the trailing paragraph text.
    #[test]
    fn test_thing_that_keeps_not_working() {
        let actual = parse_html(r#"<p><span><a href="a">@<span>a</span></a></span> hi</p>"#);

        let mut builder = gemtext::Builder::new();
        builder = builder.link("a", Some("@a".to_string()));
        builder = builder.text(" hi");
        let expected = builder.build();

        assert_eq!(expected, actual);
    }
}