~ehmry/dhall-nim

35f2b5924098a349f7f4ab790ad495dd5b64b3e9 — Emery Hemingway 1 year, 1 month ago 69f135c
NPeg parser
3 files changed, 917 insertions(+), 2 deletions(-)

M dhall.nimble
A src/dhall/parse.nim
M tests/test1.nim
M dhall.nimble => dhall.nimble +1 -1
@@ 10,4 10,4 @@ srcDir        = "src"

# Dependencies

requires "nim >= 1.2.0", "bigints", "cbor"
requires "nim >= 1.2.0", "bigints", "cbor", "npeg"

A src/dhall/parse.nim => src/dhall/parse.nim +886 -0
@@ 0,0 1,886 @@
import term

import bigints, npeg, npeg/lib/uri, npeg/lib/utf8

import algorithm, math, options, parseutils, tables, unicode

func isHex(s: string): bool {.inline.} =
  s.len > 2 and s[0] == '0' and s[1] == 'x'

proc parseBigInt(s: string): BigInt =
  if s.isHex:
    initBigInt(s[2..s.high], 16)
  else:
    initBigInt(s, 10)

when not defined(release):
  template dumpCapture(label: string) =
    echo label, " stack: ", stack
    if capture.len == 1:
      echo label, " capture: ", capture[0].s
    else:
      echo label, " capture:"
      for i in 0..<capture.len:
        echo label, "\t$", i, " ", capture[i].s

template backtrack(n = 1) =
  stack.setLen(stack.len-n)
  fail()

proc flattenOperator(stack: var seq[Term]; kind: OpKind; n: int) =
  assert(n < stack.len)
  let off = stack.high-n
  for i in off+1..stack.high:
    stack[off] = Term(kind: tOp, op: kind, opL: move(stack[off]), opR: move(stack[i]))
  stack.setLen(stack.len-n)

template flattenOperator(stack: seq[Term]; kind: OpKind) =
  let n = capture.len-1
  if n > 0:
    flattenOperator(stack, kind, n)

template push(t: Term) = add(stack, t)

template appendTextLiteral(s: string) =
  # TODO: don't parse rune-by-rune
  if stack.len > 0 and
      stack[stack.high].kind == tTextChunk and
          stack[stack.high].textExpr.isNil:
    stack[stack.high].textPrefix.add s
  else:
    push Term(kind: tTextChunk, textPrefix: s)

proc joinTextChunks(stack: var seq[Term]) =
    var n: int
    while n < stack.len and stack[stack.high-n].kind == tTextChunk:
      inc n
    let t = Term(kind: tTextLiteral,
        textChunks: newSeq[Term](n), textSuffix: "")
    let chunkOff = stack.len-n
    for i in 0..<n:
      t.textChunks[i] = move stack[chunkOff+i]
    if t.textChunks.len > 0 and t.textChunks[t.textChunks.high].textExpr.isNil:
      t.textSuffix = move t.textChunks.pop.textPrefix
    stack.setLen(succ chunkOff)
    stack[chunkOff] = t

type IndentParser = tuple[indent: string, tailLine: bool]

# TODO: break windows compatiblity in release builds

proc parse(ip: var IndentParser; s: string) =
  ## Determine the common indentation level of ``s``.
  var i: int
  if not ip.tailLine:
    i = s.skipWhile({'\r','\n'}, 0)
      # Skip empty lines.
    i.inc(s.parseWhile(ip.indent, {'\t',' '}, i))
      # Take the leading whitespace of the first non-empty line.
    ip.tailLine = true
  while i < s.len:
    let lineLen = s.skipUntil({'\r','\n'}, i)
    i.inc(lineLen)
    i.inc(s.skipWhile({'\r','\n'}, i))
      # Skip empty lines.
    let remain = s.len-i
    if 0 < remain and remain < ip.indent.len:
      ip.indent.setLen(remain)
    for j in 0..ip.indent.high:
      # Drop characters from indentation
      # not matching successive lines.
      if i+j < s.len and ip.indent[j] != s[i+j]:
        ip.indent.setLen(j)
        break
    i.inc(ip.indent.len)

proc dedent(headLine: var bool; s: var string; n: int) =
  ## Remove ``n`` leading whitespace characters from every line.
  assert(n > 0)
  var j: int
  var i = if headLine: n else: 0
  headLine = false
  while i < s.len:
    while i < s.len and s[i] notin {'\r','\n'}:
      s[j] = s[i]
      inc j
      inc i
    while i < s.len and s[i] in {'\r','\n'}:
      if s[i] != '\r':
        s[j] = '\n'
        inc j
      inc i
    inc i, n
  s.setLen(j)

let parser = peg("final_expression", stack: seq[Term]):

  final_expression <- complete_expression * !1

  complete_expression <- whsp * expression * whsp

  expression <-
      lamba_expression |
      if_expression |
      let_bindings |
      forall_expression |
      arrow_expression |
      with_expression |
      merge_annotated_expression |
      empty_list_literal |
      toMap_annotated_expression |
      assert_expression |
      annotated_expression

  lamba_expression <- lambda * whsp * '(' * whsp * nonreserved_label * whsp * ':' * whsp1 * expression * whsp * ')' * whsp * arrow * whsp * expression:
    push Term(kind: tLambda,
        lambdaLabel: move($1), lambdaBody: stack.pop(), lambdaType: stack.pop())

  if_expression <- If * whsp1 * expression * whsp * Then * whsp1 * expression * whsp * Else * whsp1 * expression:
    push Term(kind: tIf, ifFalse: stack.pop, ifTrue: stack.pop, ifCond: stack.pop)

  let_bindings <- +let_binding * In * whsp1 * expression:
    var n: int
    for i in countDown(stack.high.pred, 0):
      if stack[i].kind != tBinding:
        break
      inc n
    var t = Term(kind: tLet, letBinds: newSeq[Term](n), letBody: stack.pop())
    for i in 0..<n:
      t.letBinds[i] = move stack[stack.len-n+i]
    if t.letBody.kind == tLet:
      t.letBinds = t.letBinds & t.letBody.letBinds
      t.letBody = t.letBody.letBody
    stack.setLen(stack.len-n)
    push t

  forall_expression <- forall * whsp * '(' * whsp * (>'_' | nonreserved_label) * whsp * ':' * whsp1 * expression * whsp * ')' * whsp * arrow * whsp * expression:
    push Term(kind: tPi,
        piBody: stack.pop(),
        piType: stack.pop(),
        piLabel: move($1))

  toMap_annotated_expression <- toMap_expression * ?(whsp * >':' * whsp1 * application_expression):
    if capture.len != 2:
      backtrack()
    stack[pred stack.high].toMapAnn = stack.pop()

  Import <- import_hashed * ?( whsp * As * whsp1 * >(Text | Location) ):
    case capture.len
    of 1: discard
    of 2:
      stack[stack.high].importKind = case $1
        of "Text": iText
        of "Location": iLocation
        else: fail()
    else: fail()

  assert_expression <- Assert * whsp * ':' * whsp1 * expression:
    push Term(kind: tAssert, assertAnn: stack.pop())

  annotated_expression <- operator_expression * ?( whsp * >':' * whsp1 * expression ):
    case capture.len
    of 1: discard
    of 2:
      push Term(kind: tAnnotation,
          annAnn: stack.pop(), annExpr: stack.pop())
    else: fail()

  let_binding <- Let * whsp1 * nonreserved_label * whsp * ?( >':' * whsp1 * expression * whsp ) * '=' * whsp * expression * whsp:
    case capture.len
    of 2:
      push Term(kind: tBinding,
          key: move($1), val: stack.pop())
    of 3:
      push Term(kind: tBinding,
          key: move($1), val: stack.pop(), ann: stack.pop())
    else: fail()

  arrow_expression <- operator_expression * ?(whsp * >arrow * whsp * expression):
    # TODO: bypass operator_expression?
    if capture.len != 2:
      backtrack()
    push Term(kind: tPi,
        piLabel: "_", piBody: stack.pop(), piType: stack.pop())

  with_expression <- import_expression * *(whsp1 * >with * whsp1 * with_clause):
    if capture.len == 1:
      backtrack()
    let n = capture.len-1
    if n > 0:
      let off = stack.high-n
      for i in off+1..stack.high:
        let t = stack[i]
        let record = Term(kind: tRecordLiteral,
          recordLiteral: toTable [(move t.entryKey, move t.entryVal)])
        stack[off] = Term(kind: tOp,
            op: opRecordBiasedMerge,
            opL: move(stack[off]),
            opR: record)
      stack.setLen(stack.len-n)

  merge_annotated_expression <- merge_expression * ?(whsp * >':' * whsp1 * application_expression):
    if capture.len != 2:
      backtrack()
    stack[pred stack.high].mergeAnn = stack.pop()

  empty_list_literal <- '[' * whsp * ?( ',' * whsp ) * ']' * whsp * ':' * whsp1 * application_expression:
    let listType = stack.pop
    if listType.kind == tApp and
        listType.appFun.kind == tBuiltin and
            listType.appFun.builtin == bList:
      # applicaton of the List builtin
      if listType.appArgs.len == 1:
        # eliminate the List builtin and promote the single argument
        push Term(kind: tList, listType: move listType.appArgs[0])
      else:
        push Term(kind: tEmptyList, emptyListType: listType)
    else:
      push Term(kind: tEmptyList, emptyListType: listType)

  single_quote_continue <- interpolation * single_quote_continue |
      escaped_quote_pair * single_quote_continue |
      escaped_interpolation * single_quote_continue |
      "''" |
      single_quote_char * single_quote_continue

  double_quote_chunk <- interpolation | double_quote_escaping | double_quote_char

  double_quote_escaping <- '\\' * double_quote_escaped

  double_quote_char <- {'\x20'..'\x21'} | {'\x23'..'\x5b'} | {'\x5d'..'\x7e'} | valid_non_ascii:
    appendTextLiteral(move $0)

  double_quote_escaped <- double_quote_escaped_char | double_quote_escaped_unicode

  double_quote_escaped_char <- '"' | '$' | '\\' | '/' | 'b' | 'f' | 'n' | 'r' | 't':
    case $0
    of "b": appendTextLiteral("\b")
    of "f": appendTextLiteral("\f")
    of "n": appendTextLiteral("\n")
    of "r": appendTextLiteral("\r")
    of "t": appendTextLiteral("\t")
    else: appendTextLiteral(move $0)

  double_quote_escaped_unicode <- 'u' * unicode_escape:
    var r: uint32
    validate(parseHex($1, r) == len($1))
    appendTextLiteral(Rune(r).toUtf8)

  unicode_escape <- >unbraced_escape | ('{' * >braced_escape * '}')

  unicode_suffix <- ((Digit | {'A'..'E'}) * Xdigit[3]) | ('F' * Xdigit[2] * (Digit | {'A'..'D'}))

  unbraced_escape <- ((Digit | {'A'..'C'}) * Xdigit[3]) |
      ('D' * {'0'..'7'} * Xdigit[2]) |
      ('E' * Xdigit[3]) |
      ('F' * Xdigit[2] * (Digit | {'A'..'D'}))

  braced_codepoint <- (({'1'..'9'} | {'A'..'F'} | "10") * unicode_suffix) | unbraced_escape | Xdigit[1..3]

  braced_escape <- *'0' * braced_codepoint

  escaped_quote_pair <- "'''":
    appendTextLiteral("''")

  escaped_interpolation <- "''${":
    appendTextLiteral("${")

  single_quote_char <- Print | valid_non_ascii | tab | end_of_line:
    appendTextLiteral(move $0)

  interpolation <- "${" * complete_expression * '}':
    let textExpr = stack.pop()
    if stack.len > 0 and
        stack[stack.high].kind == tTextChunk and
            stack[stack.high].textExpr.isNil:
      stack[stack.high].textExpr = textExpr
    else:
      push Term(kind: tTextChunk, textExpr: textExpr)

  pos_or_neg <- '+' | '-'

  exponent <- 'e' * ?pos_or_neg * +Digit

  numeric_double_literal <- ?pos_or_neg * +Digit * (('.' * +Digit * ?exponent) | exponent):
    var t = Term(kind: tDoubleLiteral)
    if parseBiggestFloat($0, t.double) > 0 and
        classify(t.double) in {fcNormal,fcZero,fcNegZero}:
      push t
    else:
      validate(false)

  minus_infinity_literal <- '-' * Infinity:
    push Term(kind: tDoubleLiteral, double: system.NegInf)

  plus_infinity_literal <- Infinity:
    push Term(kind: tDoubleLiteral, double: system.Inf)

  NaN_literal <- NaN:
    push Term(kind: tDoubleLiteral, double: system.NaN)

  import_type <- missing | local | http | env

  path <- +path_component:
    let t = Term(kind: tImport,
        importElements: newSeq[string](capture.len-1))
    for i in 1..<capture.len:
      t.importElements[pred i] = move capture[i].s
    push t

  path_component <- '/' * ( >unquoted_path_component | ('"' * >quoted_path_component * '"') )

  quoted_path_component <- +quoted_path_character

  quoted_path_character <- '\x20' | '\x21' | {'\x23'..'\x2e'} | {'\x30'..'\x7e'} | valid_non_ascii

  unquoted_path_component <- +path_character

  path_character <- '\x21' |
    {'\x24'..'\x27'} |
    {'\x2a'..'\x2b'} |
    {'\x2d'..'\x2e'} |
    {'\x30'..'\x3b'} |
    '\x3d' |
    {'\x40'..'\x5a'} |
    {'\x5e'..'\x7a'} |
    '\x7c' |
    '\x7e'

  local <- parent_path | here_path | home_path | absolute_path

  absolute_path <- path:
    stack[stack.high].importScheme = 2

  here_path <- '.' * path:
    stack[stack.high].importScheme = 3

  parent_path <- ".." * path:
    stack[stack.high].importScheme = 4

  home_path <- '~' * path:
    stack[stack.high].importScheme = 5

  scheme <- "http" * ?'s'

  query <- *( uri.pchar | "/" | "|" | "?" )

  http_raw <- >scheme * "://" * (>uri.authority * >uri.path) * ?( '?' * >query):
    let t = Term(kind: tImport,
        importScheme: case $1
            of "http": 0
            of "https": 1
            else: fail())
    if $3 != "":
      t.importElements = split($3, Rune('/'))
      t.importElements[0] = move($2)
    else:
      t.importElements = @[move($2), ""]
    if capture.len > 4:
      t.importQuery = some move($4)
    push t

  http <- http_raw * ?( whsp * >Using * whsp1 * import_expression ):
    case capture.len
    of 1: discard
    of 2:
      assert(stack.len > 1)
      stack[pred stack.high].importHeaders = stack.pop()
    else: fail()

  env <- "env:" * (bash_environment_variable | posix_environment_variable)

  missing <- "missing" * !simple_label_next_char:
    push Term(kind: tImport, importScheme: 7)

  bash_environment_variable <- >( (Alpha | '_') * *(Alnum | '_') ):
    push Term(kind: tImport,
        importScheme: 6, importElements: @[move($1)])

  posix_environment_variable <- '"' * >( +posix_environment_variable_character ) * '"':
    let s = $1
    if s[s.high] == '\\': fail()
    var ev = newStringOfCap(s.len)
    var i = 0
    while i < s.high:
      if s[i] == '\\':
        case s[i+1]:
        of '"': ev.add('"')
        of '\\': ev.add('\\')
        of 'a': ev.add('\x07')
        of 'b': ev.add('\x08')
        of 'f': ev.add('\x0c')
        of 'n': ev.add('\x0a')
        of 'r': ev.add('\x0d')
        of 't': ev.add('\x09')
        of 'v': ev.add('\x0b')
        else: fail()
        inc(i, 2)
      else:
        ev.add(s[i])
        inc(i)
    ev.add(s[s.high])
    push Term(kind: tImport,
        importScheme: 6, importElements: @[ev])

  posix_environment_variable_character <- ('\\' *
      ( '"' | '\\' | 'a' | 'b' | 'f' | 'n' | 'r' | 't' | 'v')) |
      {'\x20'..'\x21'} |
      {'\x23'..'\x3c'} |
      {'\x3e'..'\x5b'} |
      {'\x5d'..'\x7e'}

  hash <- "sha256:" * >Xdigit[64]:
    var check = newSeq[byte](2+32)
    check[0] = 0x12'u8
    check[1] = 0x20'u8
    for i in 0..31:
      validate(parseHex($1, check[2+i], 2*i, 2) == 2)
    stack[stack.high].importCheck = check

  import_hashed <- import_type * ?( whsp1 * hash )

  with_clause <- any_label_or_some * *(whsp * '.' * whsp * any_label_or_some) * whsp * '=' * whsp * operator_expression:
    var tR = stack.pop()
    if capture.len > 2:
      # TODO: dubious implementation
      tR = Term(kind: tRecordLiteral,
          recordLiteral: toTable [(capture[pred capture.len].s, tR)])
    for i in countDown(capture.len-2, 1):
      var tL = stack[stack.high]
      if tL.kind == tEntry:
        tL = Term(kind: tOp,
          op: opRecordBiasedMerge,
          opL: stack[pred stack.high],
          opR: Term(kind: tRecordLiteral,
              recordLiteral: toTable [(tL.entryKey, tL.entryVal)]))
      for j in countUp(1, i):
        tL = Term(kind: tField,
            fieldRecord: tL,
            fieldName: capture[j].s)
      tR = Term(kind: tOp,
          op: opRecordBiasedMerge,
          opL: tL, opR: tR)
      if 1 < i:
        tR = Term(kind: tRecordLiteral,
            recordLiteral: toTable [(capture[i].s, tR)])
    push Term(kind: tEntry,
        entryKey: move capture[1].s, entryVal: tR)

  not_equal_expression <- application_expression * *(whsp * >"!=" * whsp * application_expression):
    stack.flattenOperator(opBoolInequality)

  equal_expression <- not_equal_expression * *(whsp * >"==" * whsp * not_equal_expression):
    stack.flattenOperator(opBoolEquality)

  times_expression <- equal_expression * *(whsp * >'*' * whsp * equal_expression):
    stack.flattenOperator(opNaturalMultiplication)

  combine_types_expression <- times_expression * *(whsp * >combine_types * whsp * times_expression):
    stack.flattenOperator(opRecordTypeMerge)

  prefer_expression <- combine_types_expression * *(whsp * >prefer * whsp * combine_types_expression):
    stack.flattenOperator(opRecordBiasedMerge)

  combine_expression <- prefer_expression * *(whsp * >combine * whsp * prefer_expression):
    stack.flattenOperator(opRecordRecursiveMerge)

  and_expression <- combine_expression * *(whsp * >"&&" * whsp * combine_expression):
    stack.flattenOperator(opBoolAnd)

  list_append_expression <- and_expression * *(whsp * >'#' * whsp * and_expression):
    stack.flattenOperator(opListAppend)

  text_append_expression <- list_append_expression * *(whsp * >"++" * whsp * list_append_expression):
    stack.flattenOperator(opTextAppend)

  plus_expression <- text_append_expression * *(whsp * >'+' * whsp1 * text_append_expression):
    stack.flattenOperator(opNaturalAdd)

  or_expression <- plus_expression * *(whsp * >"||" * whsp * plus_expression):
    stack.flattenOperator(opBoolOr)

  import_alt_expression <- or_expression * *(whsp * >'?' * whsp1 * or_expression):
    stack.flattenOperator(opAlternateImport)

  equivalence_expression <- import_alt_expression * *(whsp * >equivalence * whsp * import_alt_expression):
    stack.flattenOperator(opEquivalience)

  operator_expression <- equivalence_expression

  first_application_expression <-
      merge_expression |
      Some_expression |
      toMap_expression |
      import_expression

  application_expression <- first_application_expression * *(whsp1 * >import_expression):
    if capture.len > 1:
      let stackOff = stack.high - (capture.len - 1)
      let t = Term(kind: tApp,
        appFun: move(stack[stackOff]),
        appArgs: stack[stackOff+1..stack.high])
      stack.setLen(stackOff+1)
      stack[stackOff] = t

  merge_expression <- merge * whsp1 * import_expression * whsp1 * import_expression:
    push Term(kind: tMerge,
        mergeUnion: stack.pop(), mergeHandler: stack.pop())

  Some_expression <- Some * whsp1 * import_expression:
    push Term(kind: tSome, someVal: stack.pop())

  toMap_expression <- toMap * whsp1 * import_expression:
    push Term(kind: tToMap, toMapBody: stack.pop())

  primitive_expression <-
      double_literal |
      natural_literal |
      integer_literal |
      text_literal |
      bool_literal |
      record_literal |
      record_type |
      union_type |
      non_empty_list_literal |
      identifier |
      ( '(' * complete_expression * ')' )

  selector_expression <- primitive_expression * *(whsp * '.' * whsp * selector)
    # NOTE: Backtrack when parsing the `*("." ...)`.  The reason why is that you
    # can't tell from parsing just the period whether "foo." will become "foo.bar"
    # (i.e. accessing field `bar` of the record `foo`) or `foo./bar` (i.e. applying
    # the function `foo` to the relative path `./bar`)

  completion_expression <- selector_expression * ?( whsp * >complete * whsp * selector_expression ):
    if capture.len == 2:
      push Term(kind: tOp, op: opComplete, opR: stack.pop, opL: stack.pop)

  import_expression <- Import | completion_expression

  false_literal <- False * !simple_label_next_char:
    push Term(kind: tBoolLiteral, bool: false)

  true_literal <- True * !simple_label_next_char:
    push Term(kind: tBoolLiteral, bool: true)

  bool_literal <- false_literal | true_literal

  double_literal <- numeric_double_literal |
      minus_infinity_literal | plus_infinity_literal | NaN_literal

  integer_literal <- >pos_or_neg * >natural:
    var t =  Term(kind: tIntegerLiteral, integer: parseBigInt($2))
    if $1 == "-": t.integer.flags = {Negative}
    push t

  natural_literal <- >natural:
    push Term(kind: tNaturalLiteral, natural: parseBigInt($1))

  text_literal <- double_quote_literal | single_quote_literal

  double_quote_literal <- '"' * *double_quote_chunk * '"':
    joinTextChunks(stack)

  single_quote_literal <- "''" * end_of_line * single_quote_continue:
    joinTextChunks(stack)
    # Longest common whitespace indentation
    # is stripped from single quote literals.
    let literal = stack[stack.high]
    if literal.textSuffix[literal.textSuffix.high] != '\n':
      # Only if the closing '' has leading whitespace.
      var ip: IndentParser
      for tc in literal.textChunks:
        ip.parse(tc.textPrefix)
      ip.parse(literal.textSuffix)
      if 0 < ip.indent.len:
        var headLine = true
        for tc in literal.textChunks.mitems:
          dedent(headLine, tc.textPrefix, ip.indent.len)
        dedent(headLine, literal.textSuffix, ip.indent.len)

  identifier <- variable | identifier_builtin

  identifier_builtin <- builtin:
    push Term(kind: tBuiltin, builtin: parseBuiltin($0))

  variable <- nonreserved_label * ?(whsp * '@' * whsp * >natural):
    var t = Term(kind: tVar, varName: move($1))
    if capture.len == 3:
      let n = if isHex($2):
          parseHex($2, t.varIndex)
        else:
          parseInt($2, t.varIndex)
      validate(n == len($2))
    push t

  natural <- ("0x" * +Xdigit) | (!'0' * {'1'..'9'} * *Digit) | ('0' * !Digit)

  selector <- label_selector | fields_selector | type_selector

  label_selector <- any_label:
    for i in 1..<capture.len:
      push Term(kind: tField, fieldRecord: stack.pop(), fieldName: move capture[i].s)

  labels <- '{' * whsp * ?( any_label_or_some * whsp * *(',' * whsp * any_label_or_some * whsp) ) * '}'

  fields_selector <- labels:
    var t = Term(kind: tProject,
        projectRecord: stack.pop(),
        projectNames: newSeq[string](pred capture.len))
    for i in 1..<capture.len:
      t.projectNames[pred i] = move capture[i].s
    push t

  type_selector <- '(' * whsp * expression * whsp * ')':
    push Term(kind: tProjectType,
        projectTypeSelector: stack.pop(),
        projectTypeRecord: stack.pop())

  record_type <- '{' * whsp * ?( ',' * whsp ) * (non_empty_record_type | empty_record_type) * whsp * '}'

  record_literal <- '{' * whsp * ?( ',' * whsp ) * (non_empty_record_literal | empty_record_literal) * whsp * '}'

  empty_record_literal <- '=':
    push Term(kind: tRecordLiteral, recordLiteral: initTable[string, Term](2))

  empty_record_type <- 0:
    push Term(kind: tRecordType, recordType: initTable[string, Term](2))

  non_empty_record_type <- record_type_entry * *(whsp * >',' * whsp * record_type_entry):
    # capture the commas as a hack to figure
    # the number of record entries on the stack
    let n = capture.len
    var t = Term(kind: tRecordType,
        recordType: initTable[string, Term](nextPowerOfTwo n))
    for i in stack.len-n..stack.high:
      if t.recordType.hasKey stack[i].entryKey:
        t.recordType[stack[i].entryKey] = Term(kind: tOp,
            op: opRecordTypeMerge,
            opL: t.recordType[stack[i].entryKey],
            opR: stack[i].entryVal)
      else:
        t.recordType.add(stack[i].entryKey, stack[i].entryVal)
    stack.setLen(stack.len-n)
    push t

  record_type_entry <- any_label_or_some * whsp * ':' * whsp1 * expression:
    push Term(kind: tEntry, entryKey: move($1), entryVal: stack.pop())

  non_empty_record_literal <- record_literal_entry * *(whsp * >',' * whsp * record_literal_entry):
    # capture the commas as a hack to figure
    # the number of record entries on the stack
    let n = capture.len
    var t = Term(kind: tRecordLiteral,
        recordLiteral: initTable[string, Term](nextPowerOfTwo n))
    for i in stack.len-n..stack.high:
      if t.recordLiteral.hasKey stack[i].entryKey:
        t.recordLiteral[stack[i].entryKey] = Term(kind: tOp,
            op: opRecordRecursiveMerge,
            opL: t.recordLiteral[stack[i].entryKey],
            opR: stack[i].entryVal)
      else:
        t.recordLiteral.add(stack[i].entryKey, stack[i].entryVal)
    stack.setLen(stack.len-n)
    push t

  record_literal_entry <- record_literal_normal_entry | record_literal_punned_entry

  record_literal_normal_entry <- any_label_or_some * *(whsp * '.' * whsp * any_label_or_some) * whsp * '=' * whsp * expression:
    var t = stack.pop()
    for i in countDown(capture.len.pred, 2):
      t = Term(kind: tRecordLiteral,
          recordLiteral: toTable [(move capture[i].s, t)])
    push Term(kind: tEntry, entryKey: move($1), entryVal: t)

  record_literal_punned_entry <- any_label_or_some * !(whsp * ':') * 0:
    var t = Term(kind: tVar, varName: $1)
    push Term(kind: tEntry, entryKey: $1, entryVal: t)

  union_type <- '<' * whsp * ?( '|' * whsp ) * (non_empty_union_type | empty_union_type) * whsp * '>'

  empty_union_type <- 0:
    push Term(kind: tUnionType, union: initTable[string, Term](2))

  non_empty_union_type <- union_type_entry * *(whsp * >'|' * whsp * union_type_entry):
    # capture the bars as a hack to figure
    # the number of union entries on the stack
    let n = capture.len
    var t = Term(kind: tUnionType,
        union: initTable[string, Term](nextPowerOfTwo n))
    for i in stack.len-n..stack.high:
      validate(not t.union.hasKey(stack[i].entryKey))
      t.union.add(stack[i].entryKey, stack[i].entryVal)
    stack.setLen(stack.len-n)
    push t

  union_type_entry <- any_label_or_some * ?( whsp * >':' * whsp1 * expression ):
    push Term(kind: tEntry,
        entryKey: move($1),
        entryVal: case capture.len
            of 2: nil
            of 3: stack.pop()
            else: fail())

  non_empty_list_literal <- '[' * whsp * ?( ',' * whsp ) * expression * whsp * *(>',' * whsp * expression * whsp) * ']':
    let
      n = capture.len
      off = stack.len-n
      t = Term(kind: tList, list: newSeq[Term](n))
    for i in 0..<n:
      t.list[i] = move stack[off+i]
    stack[off] = t
    stack.setLen(succ off)

  nonreserved_label <- >(builtin * +simple_label_next_char) | (!builtin * label)

  any_label_or_some <- any_label | >Some

  any_label <- label

  label <- quoted_label | simple_label

  quoted_label <- '`' * >(*quoted_label_char) * '`'

  quoted_label_char <- {'\x20'..'\x5f'} | {'\x61'..'\x7e'}
    # omit the ` char

  simple_label_first_char <- Alpha | '_'
  simple_label_next_char <- Alnum | '-' | '/' | '_'
  simple_label <- >( (keyword * +simple_label_next_char) |
      (!keyword * simple_label_first_char * *simple_label_next_char) )

  If       <- "if"
  Then     <- "then"
  Else     <- "else"
  Let      <- "let"
  In       <- "in"
  As       <- "as"
  Using    <- "using"
  merge    <- "merge"
  Infinity <- "Infinity"
  NaN      <- "NaN"
  Some     <- "Some"
  toMap    <- "toMap"
  Assert   <- "assert"
  forall   <- "∀" | "forall"
  with     <- "with"

  keyword <- If | Then | Else | Let | In | As | Using | merge | "missing" | Infinity | NaN | Some | toMap | Assert | forall | with

  Optional <- "Optional"
  Text     <- "Text"
  List     <- "List"
  Location <- "Location"

  Bool    <- "Bool"
  True    <- "True"
  False   <- "False"
  None    <- "None"
  Natural <- "Natural"
  Integer <- "Integer"
  Double  <- "Double"
  Type    <- "Type"
  Kind    <- "Kind"
  Sort    <- "Sort"
  Natural_fold      <- "Natural/fold"
  Natural_build     <- "Natural/build"
  Natural_isZero    <- "Natural/isZero"
  Natural_even      <- "Natural/even"
  Natural_odd       <- "Natural/odd"
  Natural_toInteger <- "Natural/toInteger"
  Natural_show      <- "Natural/show"
  Natural_subtract  <- "Natural/subtract"
  Integer_toDouble  <- "Integer/toDouble"
  Integer_show      <- "Integer/show"
  Integer_negate    <- "Integer/negate"
  Integer_clamp     <- "Integer/clamp"
  Double_show       <- "Double/show"
  List_build        <- "List/build"
  List_fold         <- "List/fold"
  List_length       <- "List/length"
  List_head         <- "List/head"
  List_last         <- "List/last"
  List_indexed      <- "List/indexed"
  List_reverse      <- "List/reverse"
  Optional_fold     <- "Optional/fold"
  Optional_build    <- "Optional/build"
  Text_show         <- "Text/show"

  combine <- "∧" | "/\\"
  combine_types <- "⩓" | "//\\\\"
  equivalence <- "≡" | "==="
  prefer <- "⫽" | "//"
  lambda <- "λ" | '\\'
  arrow <- "→" | "->"
  complete <- "::"

  list_builtin <- List_build | List_fold | List_length | List_head | List_last | List_indexed | List_reverse | List

  natural_builtin <- Natural_fold | Natural_build | Natural_isZero | Natural_even | Natural_odd | Natural_toInteger | Natural_show | Natural_subtract | Natural

  integer_builtin <- Integer_toDouble | Integer_show | Integer_negate | Integer_clamp | Integer

  builtin <- list_builtin | natural_builtin | integer_builtin | Double_show | Optional_fold | Optional_build | Text_show | Bool | True | False | Optional | None | Double | Text | Type | Kind | Sort
  tab <- '\t'

  end_of_line <- '\n' | "\r\n" # fuck off windows

  line_comment <- "--" * *not_end_of_line * end_of_line

  not_end_of_line <- Print | valid_non_ascii | tab

  block_comment_continue <-
    "-}" | (block_comment * block_comment_continue) | (block_comment_char * block_comment_continue)


  block_comment <- "{-" * block_comment_continue

  block_comment_char <-
    Print | valid_non_ascii | tab | end_of_line

  whsp <- *whitespace_chunk

  whsp1 <- +whitespace_chunk

  whitespace_chunk <- ' ' | tab | end_of_line | line_comment | block_comment

  valid_non_ascii <- >utf8.any:
    ## This rule matches all characters that are not:
    ## * not ASCII
    ## * not part of a surrogate pair
    ## * not a "non-character"
    template exclude(a, b: int32) =
      let r = runeAt($1, 0)
      validate(not (cast[Rune](a) <=% r and r <=% cast[Rune](b)))
    exclude(0x0, 0x7f)
    exclude(0xd800, 0xdfff)
    exclude(0xfffe, 0xffff)
    exclude(0x1fffe, 0x1ffff)
    exclude(0x2fffe, 0x2ffff)
    exclude(0x3fffe, 0x3ffff)
    exclude(0x4fffe, 0x4ffff)
    exclude(0x5fffe, 0x5ffff)
    exclude(0x6fffe, 0x6ffff)
    exclude(0x7fffe, 0x7ffff)
    exclude(0x8fffe, 0x8ffff)
    exclude(0x9fffe, 0x9ffff)
    exclude(0xafffe, 0xaffff)
    exclude(0xbfffe, 0xbffff)
    exclude(0xcfffe, 0xcffff)
    exclude(0xdfffe, 0xdffff)
    exclude(0xefffe, 0xeffff)
    exclude(0xffffe, 0xfffff)
    exclude(0x10fffe, 0x10ffff)

proc parse*(code: string): Term =
  var stack = newSeqOfCap[Term](32)
    # TODO: how big does this stack grow?
  let match = parser.match(code, stack)
  if not match.ok:
    raise newException(ValueError, "parse did not match")
  assert(stack.len == 1, "parser did not backtrack during match")
  stack.pop()

when isMainModule:
  let str = newFileStream(stdin)
  echo str.readAll.parse

M tests/test1.nim => tests/test1.nim +30 -1
@@ 1,4 1,6 @@
import dhall/binary, dhall/render
import dhall/binary, dhall/parse, dhall/render

import cbor

import os, strutils, unittest



@@ 9,6 11,33 @@ iterator dhallTests(testDir, suffix: string): string =
      testBase.setLen(testBase.len - suffix.len)
      yield testBase

suite "parser":
  block success:
    let testDir = "dhall-lang/tests/parser/success"
    for testBase in dhallTests(testDir, "A.dhall"):
      test "success" / testBase:
        let
          code = readFile(testDir / testBase & "A.dhall")
          cbor = readFile(testDir / testBase & "B.dhallb")
          diag = readFile(testDir / testBase & "B.diag")
        checkpoint(code)
        let term = parse(code)
        checkpoint($term)
        let test = encode term
        if cbor != test:
          block:
            let test = $parseCbor(test)
            check(diag == test)
          check(cbor.toHex == test.toHex)
  block failure:
    let testDir = "dhall-lang/tests/parser/failure"
    for testBase in dhallTests(testDir, ".dhall"):
      test "failure" / testBase:
        let code = readFile(testDir / testBase & ".dhall")
        checkpoint(code)
        expect ValueError:
          discard parse(code)

suite "binary-decode":
  block success:
    let testDir = "dhall-lang/tests/binary-decode/success"