// shout out to https://gitlab.com/Kanedias/html2md
use html5ever::{
tendril::TendrilSink,
QualName,
};
use maj::{
gemini as gemtext,
};
use markup5ever::{namespace_url, ns};
use markup5ever_rcdom::{Handle, NodeData, RcDom};
/// Converts an HTML fragment into a list of gemtext nodes.
///
/// The input is parsed as a fragment inside a `<div>` context element, and the
/// resulting DOM is handed to `walk`, which fills in a `ParseState`. The nodes
/// accumulated in that state are returned.
pub fn parse_html(html: &str) -> Vec<gemtext::Node> {
    // Fragment parsing needs a context element; a plain HTML `<div>` is used.
    let div = QualName::new(None, ns!(html), markup5ever::LocalName::from("div"));
    let dom = html5ever::parse_fragment(
        RcDom::default(),
        html5ever::ParseOpts::default(),
        div,
        Vec::new(),
    )
    .one(html);

    let mut state = ParseState::default();
    walk(&dom.document, &mut state);
    state.finished
}
fn walk(input: &Handle, result: &mut ParseState) {
let mut new_context = None;
let mut was_tag = false;
match input.data {
NodeData::Document | NodeData::Doctype {..} | NodeData::ProcessingInstruction {..} | NodeData::Comment {..} => {},
NodeData::Text { ref contents } => {
let text = contents.borrow().to_string();
result.pending += &text;
}
NodeData::Element { ref name, ref attrs, .. } => {
was_tag = true;
let tag_name = name.local.to_string();
// no user-supplied factory, take one of built-in ones
new_context = match tag_name.as_ref() {
"p" => Some(Context::Paragraph),
"br" => {
result.finish();
Some(Context::Paragraph)
}
"blockquote" => Some(Context::Quote),
"h1" => Some(Context::Header(1)),
"h2" => Some(Context::Header(2)),
"h3" => Some(Context::Header(3)),
"h4" => Some(Context::Header(4)),
"h5" => Some(Context::Header(5)),
"h6" => Some(Context::Header(6)),
"pre" => Some(Context::Preformatted),
"a" => {
result.finish();
Some(Context::Link(attrs.borrow().iter().find_map(|attribute| {
if attribute.name.local == *"href" {
Some(attribute.value.to_string())
} else {
None
}
}).unwrap_or("".to_string())))
},
"li" => Some(Context::ListItem),
"span" => Some(Context::FlattenedOut),
"html" => Some(Context::Paragraph),
tag => {
log::info!("unknown tag <{}>", tag);
Some(Context::FlattenedOut)
}
};
if let Some(new_context) = &new_context {
result.context.push(new_context.clone());
}
}
}
for child in input.children.borrow().iter() {
use std::borrow::Borrow;
walk(child.borrow(), result);
}
if was_tag {
result.finish();
}
if new_context.take().is_some() {
result.context.pop();
}
}
/// Accumulator threaded through `walk` while the DOM is traversed.
#[derive(Default)]
struct ParseState {
/// Gemtext nodes emitted so far; this is what `parse_html` returns.
finished: Vec<gemtext::Node>,
/// Text collected since the last flush that has not been turned into a node yet.
pending: String,
/// Stack of contexts for the currently open elements, innermost last.
context: Vec<Context>,
}
impl ParseState {
    /// Flushes the pending text into a finished node.
    ///
    /// The node kind is decided by the innermost context; with an empty context
    /// stack the text becomes a paragraph. Nothing happens when there is no
    /// pending text, or when the innermost context is `Context::FlattenedOut`
    /// (the text then stays pending so an enclosing element can claim it).
    fn finish(&mut self) {
        if self.pending.is_empty() {
            return;
        }
        let context = match self.context.last() {
            Some(Context::FlattenedOut) => return,
            Some(context) => context,
            None => &Context::Paragraph,
        };
        // `take` leaves an empty string behind, ready for the next run of text.
        let node = context.apply(std::mem::take(&mut self.pending));
        self.finished.push(node);
    }
}
/// The kind of gemtext node that the text currently being collected will become.
#[derive(Debug, Clone, PartialEq, Eq)]
enum Context {
    /// Plain text line.
    Paragraph,
    /// Quoted text.
    Quote,
    /// Heading with the given level.
    Header(u8),
    /// Preformatted text.
    Preformatted,
    /// Link pointing at the contained target.
    Link(String),
    /// List entry.
    ListItem,
    /// Element without a node of its own; its text is left for an enclosing context.
    FlattenedOut,
}
impl Context {
    /// Wraps `contents` in the gemtext node that corresponds to this context.
    ///
    /// `FlattenedOut` has no node of its own; it yields a preformatted marker node.
    fn apply(&self, contents: String) -> gemtext::Node {
        type Node = gemtext::Node;

        match *self {
            Self::Paragraph => Node::Text(contents),
            Self::Quote => Node::Quote(contents),
            Self::Header(level) => Node::Heading {
                level,
                body: contents,
            },
            Self::Preformatted => Node::Preformatted(contents),
            Self::Link(ref href) => Node::Link {
                to: href.to_owned(),
                name: Some(contents),
            },
            Self::ListItem => Node::ListItem(contents),
            Self::FlattenedOut => Node::Preformatted(String::from("this should not happen!")),
        }
    }
}
#[cfg(test)]
mod test {
    use super::*;

    /// A link that is wrapped in a `<span>` and itself contains a `<span>` must come
    /// out as one link node, followed by the trailing text of the paragraph.
    #[test]
    fn test_thing_that_keeps_not_working() {
        let html = r#"<p><span><a href="a">@<span>a</span></a></span> hi</p>"#;
        let wanted = gemtext::Builder::new()
            .link("a", Some("@a".to_string()))
            .text(" hi")
            .build();

        assert_eq!(wanted, parse_html(html));
    }

    /// A text node starting with `#` must not come back as a heading after being
    /// rendered and parsed again.
    #[test]
    fn test_thing_that_should_work() {
        let original = gemtext::Node::Text("#1 fav food: chocolate".to_string());

        let mut rendered = Vec::<u8>::new();
        gemtext::render(vec![original], &mut rendered).unwrap();
        let rendered = String::from_utf8(rendered).unwrap();

        let reparsed = gemtext::parse(&rendered);
        let misread_as_heading = gemtext::Node::Heading {
            level: 1,
            body: "1 fav food: chocolate".to_string(),
        };
        assert_ne!(reparsed[0], misread_as_heading);
    }
}