// Copyright 2023 Hugo Osvaldo Barrera
//
// SPDX-License-Identifier: ISC
//! See [`Parser`] as the main entry point to this library.
#![warn(clippy::pedantic)]
use std::{
borrow::Cow,
iter::{Enumerate, Peekable},
str::Bytes,
};
/// A single content line, borrowed from the input it was parsed from.
///
/// A content line may be wrapped over several physical lines: every wrap is a CRLF immediately
/// followed by a single linear white-space character (i.e., SPACE or HTAB), and the text after
/// that character is a continuation of the same content line.
#[derive(Debug, PartialEq, Clone)]
pub struct ContentLine<'input> {
    // TODO: use indices instead; they're half the size and slightly simpler.
    /// The complete line exactly as found in the input, wraps included.
    raw: &'input str,
    /// The portion that precedes the first semicolon or colon.
    name: &'input str,
    /// The portion after the first semicolon, up to the colon that starts the value (if any).
    params: &'input str,
    /// The portion that follows the first unquoted colon.
    value: &'input str,
}

impl<'input> ContentLine<'input> {
    /// Return the line exactly as it appears in the input; wrapped continuation lines are left
    /// as they are.
    #[must_use]
    pub fn raw(&self) -> &'input str {
        let Self { raw, .. } = *self;
        raw
    }

    /// Return this line's name, with wrapped continuation lines joined back together.
    ///
    /// # Panics
    ///
    /// If the stored name contains a CRLF that is not followed by a space or a tab.
    #[must_use]
    pub fn name(&self) -> Cow<'input, str> {
        let Self { name, .. } = *self;
        fold_lines(name)
    }

    /// Return this line's parameter(s), with wrapped continuation lines joined back together.
    ///
    /// # Panics
    ///
    /// If the stored parameters contain a CRLF that is not followed by a space or a tab.
    #[must_use]
    pub fn params(&self) -> Cow<'input, str> {
        let Self { params, .. } = *self;
        fold_lines(params)
    }

    /// Return this line's value, with wrapped continuation lines joined back together.
    ///
    /// # Panics
    ///
    /// If the stored value contains a CRLF that is not followed by a space or a tab.
    #[must_use]
    pub fn value(&self) -> Cow<'input, str> {
        let Self { value, .. } = *self;
        fold_lines(value)
    }

    /// Normalise wrapping by extending each line to be as long as possible.
    ///
    /// # Panics
    ///
    /// Always: this method is not implemented yet.
    #[must_use]
    pub fn re_wrapped(&self) -> Cow<'input, str> {
        todo!()
    }
}
/// A flexible parser for icalendar/vcard.
///
/// This parser is designed to allow malformed input as much as possible for vdirsyncer's
/// specific use case.
///
/// It should be used via its [`Iterator`] implementation which iterates over [`ContentLine`]
/// instances.
///
/// # Known issues
///
/// - A trailing empty line is lost.
#[derive(Debug, Clone)]
pub struct Parser<'data> {
    /// The complete input handed to [`Parser::new`].
    data: &'data str,
    /// Cursor over the bytes of `data`; every item pairs a byte with its offset into `data`.
    characters: Peekable<Enumerate<Bytes<'data>>>,
}

impl<'data> Parser<'data> {
    /// Create a new parser with the given input data.
    ///
    /// The input MAY contain content lines that are still wrapped across continuation lines.
    #[must_use]
    pub fn new(data: &'data str) -> Parser<'data> {
        Parser {
            data,
            characters: data.bytes().enumerate().peekable(),
        }
    }

    /// Returns the unparsed portion of the input data.
    ///
    /// Does not advance the position of this iterator. The returned slice borrows from the
    /// input data rather than from the parser, so it stays usable while the parser is advanced
    /// further.
    #[must_use]
    pub fn remainder(&mut self) -> &'data str {
        // Copy the input reference out first so the result carries the `'data` lifetime and does
        // not overlap with the mutable borrow that `peek` needs.
        let data = self.data;
        let offset = match self.characters.peek() {
            Some(&(index, _)) => index,
            // Nothing left to read: the remainder is the empty tail of the input.
            None => data.len(),
        };
        &data[offset..]
    }
}
impl<'data> Iterator for Parser<'data> {
type Item = ContentLine<'data>;
/// Returns the next content line from the inner data.
///
/// Returns `None` after the last line has been returned. Returns `None` if called after the
/// iterator has been exhausted.
#[allow(clippy::too_many_lines)]
fn next(&mut self) -> Option<ContentLine<'data>> {
let (start, _) = *self.characters.peek()?;
loop {
match self.characters.next() {
Some((semicolon, b';')) => loop {
match self.characters.next() {
Some((colon, b':')) => loop {
match self.characters.next() {
Some((cr, b'\r')) => {
if !matches!(self.characters.peek(), Some((_, b'\n'))) {
continue; // Not CRLF.
};
self.characters.next(); // Advance the peeked LF.
if matches!(self.characters.peek(), Some((_, b' ' | b'\t'))) {
continue; // Continuation line
}
return Some(ContentLine {
raw: &self.data[start..cr],
name: &self.data[start..semicolon],
params: &self.data[semicolon + 1..colon],
value: &self.data[colon + 1..cr],
});
}
Some((_, _)) => {}
None => {
return Some(ContentLine {
raw: &self.data[start..],
name: &self.data[start..semicolon],
params: &self.data[semicolon + 1..colon],
value: &self.data[colon + 1..],
})
}
}
},
Some((_, b'"')) => loop {
match self.characters.next() {
Some((_, b'"')) => break,
Some((_, _)) => {}
None => {
// WARN: reached EOF, expected closing quote
return Some(ContentLine {
raw: &self.data[start..],
name: &self.data[start..semicolon],
params: &self.data[semicolon + 1..],
value: &self.data[semicolon..semicolon],
});
}
}
},
Some((cr, b'\r')) => {
if !matches!(self.characters.peek(), Some((_, b'\n'))) {
continue; // Not CRLF.
};
self.characters.next(); // Advance the peeked LF.
if matches!(self.characters.peek(), Some((_, b' ' | b'\t'))) {
continue; // Continuation line
}
return Some(ContentLine {
raw: &self.data[start..cr],
name: &self.data[start..semicolon],
params: &self.data[semicolon + 1..],
value: &self.data[semicolon..semicolon],
});
}
Some((_, _)) => {}
None => {
return Some(ContentLine {
raw: &self.data[start..],
name: &self.data[start..semicolon],
params: &self.data[semicolon + 1..],
value: &self.data[semicolon..semicolon],
});
}
};
},
// Begin value
Some((colon, b':')) => loop {
match self.characters.next() {
Some((cr, b'\r')) => {
if !matches!(self.characters.peek(), Some((_, b'\n'))) {
continue; // Not CRLF.
};
self.characters.next(); // Advance the peeked LF.
if matches!(self.characters.peek(), Some((_, b' ' | b'\t'))) {
continue; // Continuation line
}
return Some(ContentLine {
raw: &self.data[start..cr],
name: &self.data[start..colon],
params: &self.data[colon..colon],
value: &self.data[colon + 1..cr],
});
}
Some((_, _)) => {}
None => {
return Some(ContentLine {
raw: &self.data[start..],
name: &self.data[start..colon],
params: &self.data[colon..colon],
value: &self.data[colon + 1..],
});
}
}
},
Some((cr, b'\r')) => {
if !matches!(self.characters.peek(), Some((_, b'\n'))) {
continue; // Not CRLF.
};
self.characters.next(); // Advance the peeked LF.
if matches!(self.characters.peek(), Some((_, b' ' | b'\t'))) {
continue; // Continuation line
}
return Some(ContentLine {
raw: &self.data[start..cr],
name: &self.data[start..cr],
params: &self.data[start..start],
value: &self.data[start..start],
});
}
Some((_, _)) => {}
None => {
return Some(ContentLine {
raw: &self.data[start..],
name: &self.data[start..],
params: &self.data[start..start],
value: &self.data[start..start],
});
}
}
}
}
}
#[cfg(test)]
mod test {
    use crate::{fold_lines, ContentLine, Parser};

    /// Build the `Some(ContentLine { .. })` a test expects the parser to yield next.
    fn line<'a>(
        raw: &'a str,
        name: &'a str,
        params: &'a str,
        value: &'a str,
    ) -> Option<ContentLine<'a>> {
        Some(ContentLine {
            raw,
            name,
            params,
            value,
        })
    }

    /// Parse `data` and check that it yields exactly `expected`, in order, followed by `None`.
    ///
    /// Each expected entry is `[raw, name, params, value]`.
    fn assert_parses_to(data: &str, expected: &[[&str; 4]]) {
        let mut parser = Parser::new(data);
        for &[raw, name, params, value] in expected {
            assert_eq!(parser.next(), line(raw, name, params, value));
        }
        assert_eq!(parser.next(), None);
    }

    #[test]
    fn test_complete_example() {
        // Note: this calendar is not entirely semantically valid;
        // it is missing the timezone which is referred to in DUE.
        let data = [
            "BEGIN:VCALENDAR",
            "VERSION:2.0",
            "PRODID:nl.whynothugo.todoman",
            "BEGIN:VTODO",
            "DTSTAMP:20231126T095923Z",
            "DUE;TZID=Asia/Shanghai:20231128T090000",
            "SUMMARY:dummy todo for parser tests",
            "UID:565f48cb5b424815a2ba5e56555e2832@destiny.whynothugo.nl",
            "END:VTODO",
            "END:VCALENDAR",
        ]
        .join("\r\n");
        assert_parses_to(
            &data,
            &[
                ["BEGIN:VCALENDAR", "BEGIN", "", "VCALENDAR"],
                ["VERSION:2.0", "VERSION", "", "2.0"],
                [
                    "PRODID:nl.whynothugo.todoman",
                    "PRODID",
                    "",
                    "nl.whynothugo.todoman",
                ],
                ["BEGIN:VTODO", "BEGIN", "", "VTODO"],
                [
                    "DTSTAMP:20231126T095923Z",
                    "DTSTAMP",
                    "",
                    "20231126T095923Z",
                ],
                [
                    "DUE;TZID=Asia/Shanghai:20231128T090000",
                    "DUE",
                    "TZID=Asia/Shanghai",
                    "20231128T090000",
                ],
                [
                    "SUMMARY:dummy todo for parser tests",
                    "SUMMARY",
                    "",
                    "dummy todo for parser tests",
                ],
                [
                    "UID:565f48cb5b424815a2ba5e56555e2832@destiny.whynothugo.nl",
                    "UID",
                    "",
                    "565f48cb5b424815a2ba5e56555e2832@destiny.whynothugo.nl",
                ],
                ["END:VTODO", "END", "", "VTODO"],
                ["END:VCALENDAR", "END", "", "VCALENDAR"],
            ],
        );
    }

    #[test]
    fn test_empty_data() {
        assert_parses_to("", &[]);
    }

    #[test]
    fn test_empty_lines() {
        // A line terminated by CRLF takes a different path than a line terminated by EOF.
        //
        // FIXME: trailing empty lines are swallowed. Ideally a second, empty `ContentLine`
        // would be yielded here before `None`.
        assert_parses_to("\r\n", &[["", "", "", ""]]);
    }

    #[test]
    fn test_line_with_params() {
        // The first line ends in CRLF and the second in EOF; those are different code paths.
        let data = [
            "DTSTART;TZID=America/New_York:19970902T090000",
            "DTSTART;TZID=America/New_York:19970902T090000",
        ]
        .join("\r\n");
        let expected = [
            "DTSTART;TZID=America/New_York:19970902T090000",
            "DTSTART",
            "TZID=America/New_York",
            "19970902T090000",
        ];
        assert_parses_to(&data, &[expected, expected]);
    }

    #[test]
    fn test_line_with_dquote() {
        // Quotes in a value are plain text; quotes in parameters hide `;` and `:`.
        let data = [
            "SUMMARY:This has \"some quotes\"",
            "DTSTART;TZID=\"local;VALUE=DATE-TIME\":20150304T184500",
        ]
        .join("\r\n");
        assert_parses_to(
            &data,
            &[
                [
                    "SUMMARY:This has \"some quotes\"",
                    "SUMMARY",
                    "",
                    "This has \"some quotes\"",
                ],
                [
                    "DTSTART;TZID=\"local;VALUE=DATE-TIME\":20150304T184500",
                    "DTSTART",
                    "TZID=\"local;VALUE=DATE-TIME\"",
                    "20150304T184500",
                ],
            ],
        );
    }

    #[test]
    fn test_continuation_line() {
        // The same wrapped line twice: once terminated by CRLF and once by EOF.
        let wrapped = [
            "X-JMAP-LOCATION;VALUE=TEXT;X-JMAP-GEO=\"geo:52.123456,4.123456\";",
            " X-JMAP-ID=03453afa-71fc-4893-ba70-a7436bb6d56c:Name of place",
        ]
        .join("\r\n");
        let data = [wrapped.as_str(), wrapped.as_str()].join("\r\n");
        let expected = [
            wrapped.as_str(),
            "X-JMAP-LOCATION",
            "VALUE=TEXT;X-JMAP-GEO=\"geo:52.123456,4.123456\";\r\n X-JMAP-ID=03453afa-71fc-4893-ba70-a7436bb6d56c",
            "Name of place",
        ];
        assert_parses_to(&data, &[expected, expected]);
    }

    #[test]
    fn test_invalid_lone_name() {
        assert_parses_to("BEGIN", &[["BEGIN", "BEGIN", "", ""]]);
    }

    #[test]
    fn test_invalid_name_with_params() {
        assert_parses_to(
            "DTSTART;TZID=America/New_York",
            &[[
                "DTSTART;TZID=America/New_York",
                "DTSTART",
                "TZID=America/New_York",
                "",
            ]],
        );
    }

    #[test]
    fn test_invalid_name_with_trailing_semicolon() {
        assert_parses_to("DTSTART;", &[["DTSTART;", "DTSTART", "", ""]]);
    }

    #[test]
    fn test_invalid_name_with_trailing_colon() {
        assert_parses_to("DTSTART:", &[["DTSTART:", "DTSTART", "", ""]]);
    }

    #[test]
    fn test_remainder() {
        let data = ["BEGIN:VTODO", "SUMMARY:Do the thing"].join("\r\n");
        let mut parser = Parser::new(&data);

        assert_eq!(parser.next(), line("BEGIN:VTODO", "BEGIN", "", "VTODO"));
        assert_eq!(parser.remainder(), "SUMMARY:Do the thing");
        assert_eq!(
            parser.next(),
            line("SUMMARY:Do the thing", "SUMMARY", "", "Do the thing")
        );
        assert_eq!(parser.next(), None);
    }

    #[test]
    fn test_fold_multiline() {
        assert_eq!(
            fold_lines("UID:horrible-\r\n example"),
            "UID:horrible-example"
        );
        assert_eq!(fold_lines("UID:X\r\n Y"), "UID:XY");
        assert_eq!(fold_lines("UID:X\r\n "), "UID:X");
        assert_eq!(
            fold_lines("UID:quite\r\n a\r\n few\r\n lines"),
            "UID:quiteafewlines"
        );
    }

    #[test]
    #[should_panic]
    fn test_fold_multiline_missing_whitespace() {
        fold_lines("UID:two\r\nlines");
    }
}
/// Join the continuation lines of a wrapped content line into a single line.
///
/// Every CRLF together with the single space or tab that follows it is removed (the
/// iCalendar/vCard specifications call this *unfolding*). A CR that is not followed by LF is
/// left untouched. When the input contains no CRLF at all it is returned borrowed, without
/// allocating.
///
/// # Panics
///
/// If a CRLF is not immediately followed by a space or a tab, i.e. if the input string has
/// multiple non-continuation lines.
fn fold_lines(lines: &str) -> Cow<str> {
    // Allocated lazily, on the first wrap that has to be removed.
    let mut joined: Option<String> = None;
    // Offset of the first byte that has not been copied into `joined` yet.
    let mut pending = 0;

    while let Some(found) = lines[pending..].find("\r\n") {
        let crlf = pending + found;
        assert!(
            matches!(
                lines.as_bytes().get(crlf + 2).copied(),
                Some(b' ' | b'\t')
            ),
            "continuation line is not a continuation line",
        );
        joined
            .get_or_insert_with(String::new)
            .push_str(&lines[pending..crlf]);
        // Skip CR, LF and the single white-space byte; all three are one byte long.
        pending = crlf + 3;
    }

    match joined {
        Some(mut text) => {
            text.push_str(&lines[pending..]);
            Cow::Owned(text)
        }
        None => Cow::Borrowed(lines),
    }
}