~maxgyver83/emailbook-janet

c954334ab4dbf979feccbb6c9cd0427eb290a569 — Max Schillinger 6 months ago 2d20cf3
Skip mailboxes containing non-ASCII characters
1 files changed, 14 insertions(+), 0 deletions(-)

M emailbook.janet
M emailbook.janet => emailbook.janet +14 -0
@@ 54,6 54,7 @@ Options:
    :iso (sequence (+ "ISO" "iso") "-8859-1" (? "5"))
    :win (sequence (set "Ww") "indows-1252")
    :charset (sequence "=?" (+ :iso :win) "?Q?")}))
# Compiled PEG matching a single byte in the range 0x80 - 0xFF,
# i.e. any byte that is not 7-bit ASCII.
(def no-ascii
  (peg/compile '(range "\x80\xFF")))

(defn second
  "Look up index 1 in `xs` via `get`."
  [xs]
  (get xs 1))
(defn third
  "Look up index 2 in `xs` via `get`."
  [xs]
  (get xs 2))


@@ 228,6 229,14 @@ Options:

# "=FC" in the Q-encoded word stands for byte 0xFC, which is "ü" in ISO-8859-1.
(test (decode-iso8859-q "=?iso-8859-1?Q?M=FCller?=") "Müller")

(defn invalid-ascii
  "Search `str` for the first byte in the range 0x80 - 0xFF.
  Returns the 0-based byte index of that byte (truthy), or nil when
  `str` contains no such byte."
  [str]
  (-> no-ascii
      (peg/find str)))

# The expected value is the 0-based byte index of the first non-ASCII
# character in the input; nil means the input is pure ASCII.
(test (invalid-ascii "hello") nil)
(test (invalid-ascii "für") 1)
(test (invalid-ascii "Größe") 2)
(test (invalid-ascii "Bestätigung") 4)

(defn sanitize [mailbox]
  (var sanitized mailbox)
  # ignore addresses containing noreply, no-reply or no_reply


@@ 298,6 307,11 @@ Options:
      (var results (match-mailboxes line))
      (if results
        (each mailbox results
          # headers must use only ASCII characters (0x00 - 0x7F)
          (when (invalid-ascii mailbox)
            (eprintf "No valid ASCII: %s (skipped)" mailbox)
            (break nil)) # "continue" would be useful here

          (def mailbox-sanitized (-> mailbox decode-utf8 decode-iso8859-q sanitize))
          (cond
            (nil? mailbox-sanitized)