~quf/Earley.jl

3886dd4210181a52719c3289af0e082533878cdf — Lukas Himbert 1 year, 6 months ago trunk v1.0.0
initial public version
A  => .builds/archlinux.yml +19 -0
@@ 1,19 @@
# build manifest for sr.ht
#
# Runs the package test suite twice: once with the `julia` package installed
# from the image's repositories, and once with a pinned Julia 1.6.7 binary
# downloaded from julialang-s3.julialang.org.
image: archlinux
packages:
 # gnupg provides the `gpg` command used by the download-lts task
 - gnupg
 - julia
sources:
 - https://git.sr.ht/~quf/Earley.jl
tasks:
 # Test with the `julia` binary installed through `packages` above.
 - run-tests-current: |
    cd Earley.jl
    julia --project=. -e 'import Pkg; Pkg.test()'
 # Import the signing key shipped in .builds/juliareleases.asc, download the
 # 1.6.7 tarball, check its detached signature, and unpack it.
 - download-lts: |
    gpg --import - < Earley.jl/.builds/juliareleases.asc
    curl -o julia-1.6.7-linux-x86_64.tar.gz 'https://julialang-s3.julialang.org/bin/linux/x64/1.6/julia-1.6.7-linux-x86_64.tar.gz'
    curl 'https://julialang-s3.julialang.org/bin/linux/x64/1.6/julia-1.6.7-linux-x86_64.tar.gz.asc' | gpg --verify --trust-model always - julia-1.6.7-linux-x86_64.tar.gz
    tar xzf julia-1.6.7-linux-x86_64.tar.gz
 # Test with the unpacked 1.6.7 binary.
 # NOTE(review): the $HOME/julia-1.6.7 path assumes the previous task unpacked
 # the tarball in $HOME — confirm against the sr.ht task working directory.
 - run-tests-lts: |
    cd Earley.jl
    $HOME/julia-1.6.7/bin/julia --project=. -e 'import Pkg; Pkg.update(); Pkg.test()'

A  => .builds/juliareleases.asc +52 -0
@@ 1,52 @@
-----BEGIN PGP PUBLIC KEY BLOCK-----
Version: GnuPG v1

mQINBFXxFlcBEADQDEBFlzoyehPuk13Ct928WwBvb0q9OKyjz2NlYq3sL5ReTbQB
9P5hyl68q5iJ6QTjKEaxr+Kmjhib9dQGZhtBXRa9q185Fdav48rS9rDKR5/aPXNi
4aA0BSp7fHIDrTUGOUMB5TFpVZil+Sz4llpPKDlgG70dn3ZLBznJQKUXJWhxrheG
ogUK4W3WAdBBPDVraPjBjvTTSrhoOBJh/oNib3J6xTIaUMhOFz+Vuq05BZI9UO6n
OsE3dSW7X7dvqjcN3Ti7TgbJD5d4iOsQl8NhqItyS8ZULV8TPGOuwitoWxqgFIAL
5bhM9Of4xOE0+rmgke1dKmMkq3cu6yCEFypqyxwShexe+1Mvx4Tn4/OqC7wFVpTA
IH2ys7NsVcoLtZGqlBQnbXFmIu9ay51Zb4wwbJ5Qr9Rfx5xPvJoOVUpP/0I8+vlI
CmBkP6vs9vMCCKcreP0FpjCTSRApv9IXuwjumOMb6P0GJPOuFVfsy4849ONPC/yM
dMbeopi/BWfHu/Nqt7pqY210jncsdBPlPy7LvvhIkbpeZHQDoQVDPX88ZylhqKTy
gpWPBT5ezJ5ib0nSvYIZjMOMlMWxDaNDBGZlyHizVFwLZk6qHWM7I2WbJGvNgBTv
0dX9jBIDhdKdSZjc3wxh+nqZQg1l8xOOx9yCLSiBL1OHf4PYqJudL09AUwARAQAB
tDNKdWxpYSAoQmluYXJ5IHNpZ25pbmcga2V5KSA8YnVpbGRib3RAanVsaWFsYW5n
Lm9yZz6JAjgEEwECACIFAlXxFlcCGwMGCwkIBwMCBhUIAgkKCwQWAgMBAh4BAheA
AAoJEGbjx9wD1uSVg78QAJZUeygDHj1zTxt+8UAm4TMu0nWmcPjSzTGj5Wt4Gtec
HlWsXTOvFbABv8r3vzD2W1Bi0D0UcUucBy3Jf0nrUBWY89VTREcG/EWsF2SwSB7H
cL3pu+vcdLiVtRGI4AiSoZz2CXc4vHY0X/3TlPejcO0UU8A0Ukth/cX1ZqCjKP8T
ciXy89X4mlRAsAXapkHxiO+bscTd/VdWaPaUx8/TxeFoPZFB/0FIeJHYbI1chKPd
vAtFYLpB89d8zbQYgISM6oc/f1j0CQR6JdHGoAGP9Wd8wRz+mDT3WzOqL4jXctcA
CQUKGgYkOW8OEFBlfUACZK5uFxWMktN8//IlzczCTbYb9Z89UeeF7oaXfSZMFwiF
kxseUGCceXb5Kqj3fZKmmUstAEzycyNuCeXG1KXyAz1mg/ihq/rzB11vQQjY4WYJ
rIoUecRN3btSex6jcdOxAIOeGcyfigT7NMgplFXXkbuux2N7qtOkLUNx80DMOggK
tnSP60GkO1xzJLi3EHtaDVPU59KpeXjyEsNB2ngc5+LwHwbYGvaaZaFXFm7oCmM7
xG88EU14mCLZbpGleD6cmpVAprFSIXV0Z0xm6pdH9XBCT4UJ8tFXTrJsc1dYd+mw
eAwCYZ38e95kqrYrRbhjOOAKEtf3t4VnrsifbTfTVclUbsrSXVTQdHoiMlODc/WX
uQINBFXxFlcBEADNmFCh53NJ+8CQSzQda/efBX+H/SCj2b3vIYJXY2nR9h4IQ7UV
/AU5sUB/bpIN3nwwdcILYSm2oJGP8fZ8Zf46XliUOK8+yD8ApDg6okl3R1G+E9Qk
/EN49BCeXx9uT5vHpcHWkBvKmqmjUJ283i6q3QT5qzbkCGGUQ7SyhU1ywbjYIQi/
HLJpntqz44LrM+vfGUAa+CJld3DyzAm66KFSRbDU12XPE948MxUDQ1NgY9hJIlfm
ud/ShKakfQoEsLiTkUbEY7Vc19s2+aM3S1zeRfsatuayPuEUsnuz42wKWSdPNGyJ
TkLdWz46vSgN9wpe0OLoWxsuomaViRaNFDSK7Uo+AGjWcjFNlehFlW/ELji1JbS5
f5EAD1A1I2RJvLHyri3xFJtM9qbGiA3ZIfcVXq5RxAOehDPCcKzBS4w37D2vLBOQ
Xa+ExTJxwiCnMPuo7acsfkyleakAe82L/fAoVWdPcFSjq3KFvkpGpTlvvh2jwhoW
AgDGu77K9T1rHjj7t2GjuR71RVc4r0CP9iF3rAPmq/FapONW1Pz0aom7XLBZt8Zq
4wsPsGaAECmwi07bE6Vr9nqCeQb7XmjVucVJP+VXDpOJzt4J5zSzTCWGyj47/K7a
Rlz9KtYmY0s4sKnx3sjKpC8xMXaLgvSjudrQCZ/sohKRayKGAMI2p71GbQARAQAB
iQIfBBgBAgAJBQJV8RZXAhsMAAoJEGbjx9wD1uSV6+oP/3MCyMWEBiu73HVI2dS2
hDct/E9fDkpB6o/HEGhdNFTeeb/L7GqcQACJDtBDNVtMu0WhCgKeteHXM0KMy55f
6HAQEVnWhGSyR4KksV93RPZvUO+zzX5M7F2LiI59MSruKAYTC0kXbjcu9aQAn+kJ
EPHiHwsTzRkWh90q54/B2NQ6oVAHgnMIeh32OBdFMNHOnP+n1zu/+Wd4miC3fR9V
tmsVrOS8WtozdEC6TmquYswQ/gT6c0afCZSlNF/ZPPrXGGdD6t9WTJntfYB1rbEk
E/9WpaUgpKpxXQEOMzMAm+2yBoYnCpXzvbY6fzNWfOg6DJ65t0rkrCwDRHLH1grA
61OQb0Ou8LQnrFGox8L394sFebIoaBUk2Vhw5LH78X6g1f7Mj6j9Er0YSabVVpHh
ncMYflOeswrV4C1oP5UvL7K3qtCixUU4LQ4XqmioQey8AnrCdJ7S5QeyP1n5vU3e
Nz1JHCcH4/e698CuIoCZa86Edmo3S0O2hhiC5qslf5u1pdndlmbrgsWpBH5kJ7mI
edeA2ND/KrLlllE7NImLdlrciShctFP1ciqqHtTebQ+5MH17ObOhSptUDEt5LjZt
3YXZtQ+C/UmfkC+QVUdWTQ4cWUCNtuzLP+PW3o1AQHmijWbaECq5yMRVlr7JuxPr
Lr+fAJHZvbYCQjMTkZYScgYU
=XN/B
-----END PGP PUBLIC KEY BLOCK-----

A  => .gitignore +1 -0
@@ 1,1 @@
Manifest.toml

A  => LICENSE.txt +287 -0
@@ 1,287 @@
                      EUROPEAN UNION PUBLIC LICENCE v. 1.2
                      EUPL © the European Union 2007, 2016

This European Union Public Licence (the ‘EUPL’) applies to the Work (as defined
below) which is provided under the terms of this Licence. Any use of the Work,
other than as authorised under this Licence is prohibited (to the extent such
use is covered by a right of the copyright holder of the Work).

The Work is provided under the terms of this Licence when the Licensor (as
defined below) has placed the following notice immediately following the
copyright notice for the Work:

        Licensed under the EUPL

or has expressed by any other means his willingness to license under the EUPL.

1. Definitions

In this Licence, the following terms have the following meaning:

- ‘The Licence’: this Licence.

- ‘The Original Work’: the work or software distributed or communicated by the
  Licensor under this Licence, available as Source Code and also as Executable
  Code as the case may be.

- ‘Derivative Works’: the works or software that could be created by the
  Licensee, based upon the Original Work or modifications thereof. This Licence
  does not define the extent of modification or dependence on the Original Work
  required in order to classify a work as a Derivative Work; this extent is
  determined by copyright law applicable in the country mentioned in Article 15.

- ‘The Work’: the Original Work or its Derivative Works.

- ‘The Source Code’: the human-readable form of the Work which is the most
  convenient for people to study and modify.

- ‘The Executable Code’: any code which has generally been compiled and which is
  meant to be interpreted by a computer as a program.

- ‘The Licensor’: the natural or legal person that distributes or communicates
  the Work under the Licence.

- ‘Contributor(s)’: any natural or legal person who modifies the Work under the
  Licence, or otherwise contributes to the creation of a Derivative Work.

- ‘The Licensee’ or ‘You’: any natural or legal person who makes any usage of
  the Work under the terms of the Licence.

- ‘Distribution’ or ‘Communication’: any act of selling, giving, lending,
  renting, distributing, communicating, transmitting, or otherwise making
  available, online or offline, copies of the Work or providing access to its
  essential functionalities at the disposal of any other natural or legal
  person.

2. Scope of the rights granted by the Licence

The Licensor hereby grants You a worldwide, royalty-free, non-exclusive,
sublicensable licence to do the following, for the duration of copyright vested
in the Original Work:

- use the Work in any circumstance and for all usage,
- reproduce the Work,
- modify the Work, and make Derivative Works based upon the Work,
- communicate to the public, including the right to make available or display
  the Work or copies thereof to the public and perform publicly, as the case may
  be, the Work,
- distribute the Work or copies thereof,
- lend and rent the Work or copies thereof,
- sublicense rights in the Work or copies thereof.

Those rights can be exercised on any media, supports and formats, whether now
known or later invented, as far as the applicable law permits so.

In the countries where moral rights apply, the Licensor waives his right to
exercise his moral right to the extent allowed by law in order to make effective
the licence of the economic rights here above listed.

The Licensor grants to the Licensee royalty-free, non-exclusive usage rights to
any patents held by the Licensor, to the extent necessary to make use of the
rights granted on the Work under this Licence.

3. Communication of the Source Code

The Licensor may provide the Work either in its Source Code form, or as
Executable Code. If the Work is provided as Executable Code, the Licensor
provides in addition a machine-readable copy of the Source Code of the Work
along with each copy of the Work that the Licensor distributes or indicates, in
a notice following the copyright notice attached to the Work, a repository where
the Source Code is easily and freely accessible for as long as the Licensor
continues to distribute or communicate the Work.

4. Limitations on copyright

Nothing in this Licence is intended to deprive the Licensee of the benefits from
any exception or limitation to the exclusive rights of the rights owners in the
Work, of the exhaustion of those rights or of other applicable limitations
thereto.

5. Obligations of the Licensee

The grant of the rights mentioned above is subject to some restrictions and
obligations imposed on the Licensee. Those obligations are the following:

Attribution right: The Licensee shall keep intact all copyright, patent or
trademarks notices and all notices that refer to the Licence and to the
disclaimer of warranties. The Licensee must include a copy of such notices and a
copy of the Licence with every copy of the Work he/she distributes or
communicates. The Licensee must cause any Derivative Work to carry prominent
notices stating that the Work has been modified and the date of modification.

Copyleft clause: If the Licensee distributes or communicates copies of the
Original Works or Derivative Works, this Distribution or Communication will be
done under the terms of this Licence or of a later version of this Licence
unless the Original Work is expressly distributed only under this version of the
Licence — for example by communicating ‘EUPL v. 1.2 only’. The Licensee
(becoming Licensor) cannot offer or impose any additional terms or conditions on
the Work or Derivative Work that alter or restrict the terms of the Licence.

Compatibility clause: If the Licensee Distributes or Communicates Derivative
Works or copies thereof based upon both the Work and another work licensed under
a Compatible Licence, this Distribution or Communication can be done under the
terms of this Compatible Licence. For the sake of this clause, ‘Compatible
Licence’ refers to the licences listed in the appendix attached to this Licence.
Should the Licensee's obligations under the Compatible Licence conflict with
his/her obligations under this Licence, the obligations of the Compatible
Licence shall prevail.

Provision of Source Code: When distributing or communicating copies of the Work,
the Licensee will provide a machine-readable copy of the Source Code or indicate
a repository where this Source will be easily and freely available for as long
as the Licensee continues to distribute or communicate the Work.

Legal Protection: This Licence does not grant permission to use the trade names,
trademarks, service marks, or names of the Licensor, except as required for
reasonable and customary use in describing the origin of the Work and
reproducing the content of the copyright notice.

6. Chain of Authorship

The original Licensor warrants that the copyright in the Original Work granted
hereunder is owned by him/her or licensed to him/her and that he/she has the
power and authority to grant the Licence.

Each Contributor warrants that the copyright in the modifications he/she brings
to the Work are owned by him/her or licensed to him/her and that he/she has the
power and authority to grant the Licence.

Each time You accept the Licence, the original Licensor and subsequent
Contributors grant You a licence to their contributions to the Work, under the
terms of this Licence.

7. Disclaimer of Warranty

The Work is a work in progress, which is continuously improved by numerous
Contributors. It is not a finished work and may therefore contain defects or
‘bugs’ inherent to this type of development.

For the above reason, the Work is provided under the Licence on an ‘as is’ basis
and without warranties of any kind concerning the Work, including without
limitation merchantability, fitness for a particular purpose, absence of defects
or errors, accuracy, non-infringement of intellectual property rights other than
copyright as stated in Article 6 of this Licence.

This disclaimer of warranty is an essential part of the Licence and a condition
for the grant of any rights to the Work.

8. Disclaimer of Liability

Except in the cases of wilful misconduct or damages directly caused to natural
persons, the Licensor will in no event be liable for any direct or indirect,
material or moral, damages of any kind, arising out of the Licence or of the use
of the Work, including without limitation, damages for loss of goodwill, work
stoppage, computer failure or malfunction, loss of data or any commercial
damage, even if the Licensor has been advised of the possibility of such damage.
However, the Licensor will be liable under statutory product liability laws as
far such laws apply to the Work.

9. Additional agreements

While distributing the Work, You may choose to conclude an additional agreement,
defining obligations or services consistent with this Licence. However, if
accepting obligations, You may act only on your own behalf and on your sole
responsibility, not on behalf of the original Licensor or any other Contributor,
and only if You agree to indemnify, defend, and hold each Contributor harmless
for any liability incurred by, or claims asserted against such Contributor by
the fact You have accepted any warranty or additional liability.

10. Acceptance of the Licence

The provisions of this Licence can be accepted by clicking on an icon ‘I agree’
placed under the bottom of a window displaying the text of this Licence or by
affirming consent in any other similar way, in accordance with the rules of
applicable law. Clicking on that icon indicates your clear and irrevocable
acceptance of this Licence and all of its terms and conditions.

Similarly, you irrevocably accept this Licence and all of its terms and
conditions by exercising any rights granted to You by Article 2 of this Licence,
such as the use of the Work, the creation by You of a Derivative Work or the
Distribution or Communication by You of the Work or copies thereof.

11. Information to the public

In case of any Distribution or Communication of the Work by means of electronic
communication by You (for example, by offering to download the Work from a
remote location) the distribution channel or media (for example, a website) must
at least provide to the public the information requested by the applicable law
regarding the Licensor, the Licence and the way it may be accessible, concluded,
stored and reproduced by the Licensee.

12. Termination of the Licence

The Licence and the rights granted hereunder will terminate automatically upon
any breach by the Licensee of the terms of the Licence.

Such a termination will not terminate the licences of any person who has
received the Work from the Licensee under the Licence, provided such persons
remain in full compliance with the Licence.

13. Miscellaneous

Without prejudice of Article 9 above, the Licence represents the complete
agreement between the Parties as to the Work.

If any provision of the Licence is invalid or unenforceable under applicable
law, this will not affect the validity or enforceability of the Licence as a
whole. Such provision will be construed or reformed so as necessary to make it
valid and enforceable.

The European Commission may publish other linguistic versions or new versions of
this Licence or updated versions of the Appendix, so far this is required and
reasonable, without reducing the scope of the rights granted by the Licence. New
versions of the Licence will be published with a unique version number.

All linguistic versions of this Licence, approved by the European Commission,
have identical value. Parties can take advantage of the linguistic version of
their choice.

14. Jurisdiction

Without prejudice to specific agreement between parties,

- any litigation resulting from the interpretation of this License, arising
  between the European Union institutions, bodies, offices or agencies, as a
  Licensor, and any Licensee, will be subject to the jurisdiction of the Court
  of Justice of the European Union, as laid down in article 272 of the Treaty on
  the Functioning of the European Union,

- any litigation arising between other parties and resulting from the
  interpretation of this License, will be subject to the exclusive jurisdiction
  of the competent court where the Licensor resides or conducts its primary
  business.

15. Applicable Law

Without prejudice to specific agreement between parties,

- this Licence shall be governed by the law of the European Union Member State
  where the Licensor has his seat, resides or has his registered office,

- this licence shall be governed by Belgian law if the Licensor has no seat,
  residence or registered office inside a European Union Member State.

Appendix

‘Compatible Licences’ according to Article 5 EUPL are:

- GNU General Public License (GPL) v. 2, v. 3
- GNU Affero General Public License (AGPL) v. 3
- Open Software License (OSL) v. 2.1, v. 3.0
- Eclipse Public License (EPL) v. 1.0
- CeCILL v. 2.0, v. 2.1
- Mozilla Public Licence (MPL) v. 2
- GNU Lesser General Public Licence (LGPL) v. 2.1, v. 3
- Creative Commons Attribution-ShareAlike v. 3.0 Unported (CC BY-SA 3.0) for
  works other than software
- European Union Public Licence (EUPL) v. 1.1, v. 1.2
- Québec Free and Open-Source Licence — Reciprocity (LiLiQ-R) or Strong
  Reciprocity (LiLiQ-R+).

The European Commission may update this Appendix to later versions of the above
licences without producing a new version of the EUPL, as long as they provide
the rights granted in Article 2 of this Licence and protect the covered Source
Code from exclusive appropriation.

All other changes or additions to this Appendix require the production of a new
EUPL version.

A  => Project.toml +12 -0
@@ 1,12 @@
name = "Earley"
uuid = "98d8810e-6867-4d91-bca0-8798dfcfe9b1"
authors = ["Lukas Himbert <lukas@2.71828.eu>"]
version = "1.0.0"

[deps]
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[compat]
julia = "1.0"
DataStructures = "0.18"

A  => README.md +152 -0
@@ 1,152 @@
Earley.jl
=========

Parse context-free languages with Earley's algorithm.

Examples
--------

```julia
julia> grammar = Grammar([ # Decimal numbers with an optional sign
           (:Number   => [:Unsigned],                     identity),
           (:Number   => [Match.OneOf("+-"), :Unsigned],  (c, n) -> (c == '+') ? n : -n),
           (:Unsigned => [Match.Digit()],                 d      -> d - '0'),
           (:Unsigned => [:Unsigned, Match.Digit()],      (n, d) -> 10n + (d-'0')),
       ]);

julia> parse(grammar, "-12")
-12
```

```julia
julia> grammar = Grammar([ # s-expressions
           (:sexpr  => [:par_open, :values, :par_close],  (_, values, _)   -> tuple(values...)),
           (:value  => [:identifier],                     identity),
           (:value =>  [:sexpr],                          identity),
           (:values => [],                                ()               -> []),
           (:values => [:ws, :values, :ws, :value, :ws],  (_, vs, _, v, _) -> push!(vs, v)),
           (:identifier => [Match.Letter()],              c                -> string(c)),
           (:identifier => [:identifier, Match.Letter()], (i, c)           -> i * c),
           (:par_open => [:ws, '(', :ws],                 (_, _, _)        -> nothing),
           (:par_close => [:ws, ')', :ws],                (_, _, _)        -> nothing),
           (:ws => [],                                    ()               -> nothing),
           (:ws => [Match.Space(), :ws],                  (_, _)           -> nothing),
       ]);

julia> parse(grammar, "(abc (def ghi) (j))")
("abc", ("def", "ghi"), ("j",))
```

```julia
julia> g = Grammar([ # A simple arithmetic grammar with mixed associativity.
           (:expression => [:sum],      identity),
           (:expression => [:product],  identity),
           (:sum => [:sum, '+', :product], (e1,_,e2) -> Expr(:call, :+, e1, e2)),
           (:sum => [:sum, '-', :product], (e1,_,e2) -> Expr(:call, :-, e1, e2)),
           (:sum => [:product],                   identity),
           (:product => [:factor],                identity),
           (:product => [:product, '*', :factor], (e1,_,e2) -> Expr(:call, :*, e1, e2)),
           (:factor => [:number], identity),
           (:factor => [:power],  identity),
           (:factor => ['(', :expression, ')'], (_,e,_) -> e),
           (:number => [Match.Digit()], c -> c-'0'),
           (:power => [:factor, '^', :factor], (e1,_,e2) -> Expr(:call, :^, e1, e2)),
       ]);

julia> parse(g, "1+2-3+4")
:(((1 + 2) - 3) + 4)

julia> parse(g, "2*3^4^(5+6)*7")
:((2 * 3 ^ (4 ^ (5 + 6))) * 7)

julia> parse(g, "1-2*3^4+5")
:((1 - 2 * 3 ^ 4) + 5)
```

```julia
julia> grammar = CFG([ # An even number of 'a' characters
           :A => [:A, :A],
           :A => ['a', 'a'],
           :A => [],
       ]);

julia> recognize(grammar, "aaa")
false

julia> recognize(grammar, "aaaa")
true
```


Overview
--------

This package provides the following:

* `CFG`, a datatype for modeling context-free grammars.

* `Grammar`, a datatype for modeling context-free grammars and semantic actions associated with each production rule; i.e. a grammar with synthesized attributes.

* `recognize(grammar, input)`, a function that can tell for any grammar and any input, whether the input belongs to the language defined by the grammar.

* `parse(grammar, input)`, a function that can parse a given input and return either a parse tree, or a value computed through semantic actions.

* `matches`, a function that matches input tokens against terminals listed in the production rules.

* `Match`, a submodule with various predefined token classes.

For detailed information, see the respective Julia docstrings.


Compatibility
-------------

`Earley.jl` follows [semantic versioning v2.0.0](https://semver.org/).
The current version is 1.0.0.

This package works with Julia version 1.6.7 (the current LTS) and above.
It should also work for Julia version 1.0.

It depends on the [DataStructures](https://github.com/JuliaCollections/DataStructures.jl) package.


Releases
--------

### Version 1.0.0

Initial public version


Bugs, Caveats, TODO
-------------------

- Performance has only been a minor consideration during the development of this package.
  Some of the included algorithms have asymptotically faster alternatives which are not implemented here.

- There is no support for repeated or optional terms, such as `A ::= 'a' *` from EBNF.
  It's up to the user to translate constructs such as this into the form required for recognition/parsing.

- The parser does not support cyclic grammars.
  (The recognizer does.)
  It seems feasible to add support for cyclic grammars in principle, but it would require a lot of effort and the payoff would be questionable.

- In the case of ambiguous languages, the parser can only return one parse tree, not the whole parse forest.

- There is no support for reporting partial parses or likely fixes in the case of minor syntax errors.

- Error messages for incorrect grammars may be hard to decipher.


See also
--------

A [tutorial by Loup Vaillant](https://loup-vaillant.fr/tutorials/earley-parsing/) has been helpful to the author in understanding the principles of Earley parsing.


Copyright
---------

Ⓒ Lukas Himbert 2022

Licensed under the [EUPL-1.2-or-later](https://joinup.ec.europa.eu/collection/eupl/eupl-text-eupl-12).

A  => src/Earley.jl +752 -0
@@ 1,752 @@
"""`Earley`

`Earley` contains a recognizer and parser for context-free grammars.

Context-free grammars are modeled using the `CFG` datatype (preferred for
the recognizer) or the `Grammar` datatype (preferred for the parser). The
`Grammar` type includes a semantic action for each production rule of the
grammar.

The module provides the following functionality:

* `recognize(grammar, input)` checks if the `input` matches the language
  described by the `grammar`.

* `parse(grammar :: Grammar, input)` parses the `input` and computes the
  semantic actions on the parse tree.

* `parse(grammar :: CFG, input)` parses the `input` and returns a `ParseTree`.

* `matches(class, token)` is used by the algorithm to match a token (in the
  input) against a terminal/token class (in the right-hand side of a
  production rule). Some terminals are predefined in the `Match` submodule.

`recognize` supports all context-free grammars.

`parse` supports most practical grammars, including those with left-recursion,
right-recursion, mixed recursion, and ambiguity. However, cyclic grammars are
not supported as these may generate an infinite parse tree for a finite input.
"""
module Earley

using DataStructures: Stack
using DataStructures: Queue, enqueue!, dequeue!
using DataStructures: OrderedSet

include("Match.jl")
using .Match

import Base.parse

export CFG
export Grammar
export matches
export Match
export recognize



### CFG datatype

"""`CFG{T}(rules[, start_symbol])`

A context-free grammar with terminals of type `T` (usually `Any`). If no
`start_symbol` is given, the lhs of the first rule in `rules` is assumed.

`rules` is an array of production rules in the form of pairs `A => rhs`,
where `A` is a nonterminal (a `Symbol`) and `rhs` is an array of terminals (of
any type) and nonterminals.

Examples
========

```
CFG{Char}([
  :A => ['a'],
  :A => [:A, :A]
]);
```
is an ambiguous grammar that matches any (nonzero) number of 'a' characters.

```
CFG{String}([
  :Block => ["{}"],
  :Block => ["if()", :Block],
  :Block => ["if()", :Block, "else", :Block],
]);
```
is a grammar that matches a trivial C-like language with "dangling else"
ambiguity. Unlike the previous example, input tokens are strings rather than
Chars.

```
CFG([
  :Number => [Match.Digit()],
  :Number => [Match.Digit(), :Number]
]);
```
matches an unsigned decimal number. The parameter `T` is assumed to be
`Any`.
"""
struct CFG{T}
  # Production rules `lhs => rhs`; every rhs term is either a nonterminal
  # (`Symbol`) or a terminal of type `T`.
  rules :: Vector{Pair{Symbol,Vector{Union{Symbol,T}}}}
  # Nonterminal that has to derive the complete input.
  start_symbol :: Symbol

  # Copies `rules` (any iterable of `lhs => rhs`) into the concrete storage
  # type, converting each right-hand side to a `Vector{Union{Symbol,T}}`.
  function CFG{T}(rules, start_symbol) where {T}
    Term = Union{Symbol,T}
    productions = Pair{Symbol,Vector{Term}}[]
    for (head, body) in rules
      push!(productions, head => collect(Term, body))
    end
    return new{T}(productions, start_symbol)
  end
end

# Without an explicit start symbol, the lhs of the first rule is used.
function CFG{T}(rules) where {T}
  leading_rule = first(rules)
  return CFG{T}(rules, leading_rule[1])
end

# Assume tokens to be of type Any unless requested otherwise.
CFG(rules) = CFG{Any}(rules)
CFG(rules, start_symbol) = CFG{Any}(rules, start_symbol)



### Grammar datatype

"""`Grammar{T}(rules_and_actions[; start_symbol])`

A context-free grammar with terminals of type `T` (usually `Any`), as well as
a list of semantic actions (i.e. functions). `start_symbol` is an optional
argument specifying the start symbol (that needs to match the whole input). If
no value is supplied, the left-hand side of the first production rule is
assumed.

The list of rules and actions should be an iterable of tuples
`(lhs => rhs, action)`, where `lhs => rhs` is a production rule of a context
free grammar as described in the documentation for `CFG`. In short, `lhs`
should be a `Symbol` (nonterminal) and `rhs` should be an array of `Symbol`s
and terminals (which may be compared to the input using `matches`). The
`action` should be a function taking one argument for each term on the
`rhs` of the rule and returning a single value.

In formal terms, `Grammar` describes an attribute grammar with synthesized
attributes, and the semantic actions are closures which define the attributes.

Example
=======

```
julia> g_num = Grammar([
  (:Number   => [:Unsigned],                 n      -> n),
  (:Number   => ['+', :Unsigned],            (_, n) -> n),
  (:Number   => ['-', :Unsigned],            (_, n) -> -n),
  (:Unsigned => [Match.Digit()],             d      -> (d - '0')),
  (:Unsigned => [:Unsigned, Match.Digit()],  (n, d) -> 10n + (d - '0'))
]);
```
matches a decimal integer (with optional sign) and computes its value.
"""
struct Grammar{T}
  # The underlying context-free grammar (production rules and start symbol).
  cfg :: CFG{T}
  # `actions[i]` is the semantic action that belongs to `cfg.rules[i]`.
  actions :: Vector{Function}

  # `g` is an iterable of `(lhs => rhs, action)` tuples. It is traversed
  # twice: once to collect the production rules and once to collect the
  # actions, so both end up in the same order.
  function Grammar{T}(g, start_symbol::Symbol) where {T}
    cfg = CFG{T}((rule for (rule, _) in g), start_symbol)
    actions = Function[action for (_, action) in g]
    return new{T}(cfg, actions)
  end
end

# The start symbol may be passed positionally (method above) or through the
# `start_symbol` keyword, which is the form the docstring advertises. When it
# is omitted, the left-hand side of the first production rule is used.
function Grammar{T}(g; start_symbol::Union{Symbol,Nothing} = nothing) where {T}
  if start_symbol === nothing
    ((first_lhs, _), _) = first(g)
    return Grammar{T}(g, first_lhs)
  end
  return Grammar{T}(g, start_symbol)
end

# Assume tokens to be of type Any unless requested otherwise.
function Grammar(g; start_symbol::Union{Symbol,Nothing} = nothing)
  return Grammar{Any}(g; start_symbol = start_symbol)
end
Grammar(g, s::Symbol) = Grammar{Any}(g, s)



### Earley Item datatype

"""`Item(start, rule, dot)`

Earley parser item, i.e. a compact representation of a partial parse.

* `start` is the position in the token stream where the partial parse begins,
  i.e. `start` is one if the parse starts at the first input token, two if it
  starts at the second input token, etc.
* `rule` is the index of the rule being parsed in the list of rules, i.e. one
  for the first rule, two for the second rule, etc.
* `dot` is the position of the 'dot' (in the common abstract notation of the
  algorithm), i.e. which sub-parse we're expecting next. The value is one if
  we need to try the first sub-parse, two if we need to try the second
  sub-parse, and equal to one plus the number of terms on the right-hand side
  if the sub-parse has been completed successfully.
"""
struct Item
  start::Int  # 1-based input position at which this partial parse begins
  rule::Int   # index of the production rule in the grammar's rule list
  dot::Int    # index of the next rhs term to match; length(rhs) + 1 once complete
end


### recognizer

"""`recognize(grammar::CFG, input)`

Returns true if the input matches the given `grammar`, false otherwise. The
`input` shall be an array of tokens (e.g. Char, UInt8, String, …).

Terminal input tokens are matched against token classes using `matches`. The
default implementation compares equality (i.e. if a token is present in the
right hand side of a production rule, that token must be matched in the input
for the rule to match), but `matches` can be overloaded to create custom token
classes for convenience. For predefined classes, see `Earley.Match`.

There is no restriction on the `grammar`, except that it must be context-free.

Examples
--------

```
julia> recognize(
  CFG([
    :Par => ['(', :Par, ')'],
    :Par => []
    ]),
  "(()))"
  )
false
```

```
julia> recognize(
  CFG([
    :Num => ['0', 'x', :Digits],
    :Digits => [Match.HexDigit()],
    :Digits => [Match.HexDigit(), :Digits],
    ]),
  "0xc0ffee"
  )
true
```
"""
function recognize(grammar :: CFG, input) :: Bool
  # Only the last Earley set matters: it holds the items that are still alive
  # once the whole input has been read.
  final_set = last(chart(grammar, input))

  # Accept as soon as one item is a completed rule of the start symbol whose
  # parse began at the first input position.
  for item in final_set
    (lhs, rhs) = grammar.rules[item.rule]
    item.start == 1 || continue
    lhs == grammar.start_symbol || continue
    if item.dot > length(rhs)
      return true
    end
  end
  return false
end


"""`recognize(grammar :: Grammar, input)`

Returns true if the input matches the given grammar, false otherwise. Semantic
actions associated with the production rules are ignored.
"""
recognize(grammar :: Grammar, input) :: Bool =
  recognize(grammar.cfg, input)  # only the wrapped CFG is consulted; actions are not run


"""`chart(grammar :: CFG, input)`

Core implementation of the Earley algorithm.

Computes an Earley chart (list of attempted partial parses). This works for
any context-free `grammar` (as described in `CFG`) and `input`. `input` may be
an arbitrary iterator.
"""
function chart(grammar :: CFG, input)
  # The array `items` holds the partial parses in form of Earley items (struct `Item`).
  # The first entry is the set of items that apply before any token has been consumed,
  # the second entry is the set of all sub-parses after the first token has been consumed,
  # the third entry is the set of all sub-parses after the second token has been consumed,
  # and so on. Hence there is one more entry than there are input tokens.
  items = [OrderedSet{Item}() for _ in 1:length(input)+1]

  # Nonterminals that can produce the empty string; `predict!` uses this set to step over them right away.
  null = nullables(grammar)

  # We start parsing from the start symbol:
  # every rule for it is seeded at position 1 with the dot before its first term.
  for (i, (lhs, _)) in enumerate(grammar.rules)
    if lhs == grammar.start_symbol
      push!(items[1], Item(1, i, 1))
    end
  end

  # Continue all possible partial parses, one token at a time
  for (i, token) in enumerate(input)
    # `complete!` and `predict!` push further items onto `items[i]` while this loop is running.
    # NOTE(review): this relies on the iteration also visiting elements appended during the loop — confirm for OrderedSet.
    for item in items[i]
      (lhs, rhs) = grammar.rules[item.rule]
      if item.dot > length(rhs)
        # the dot is past the last term, i.e. the rule has been matched completely
        complete!(items, i, grammar, item)
      else
        if rhs[item.dot] isa Symbol
          # the next term is a nonterminal: start parsing its rules here
          predict!(items, i, grammar, item, null)
        elseif matches(rhs[item.dot], token)
          # the next term is a terminal (class) that accepts the current token: carry the item over to position i+1
          scan!(items, i, grammar, item)
        else
          # partial parse failed, can't continue
        end
      end
    end
  end

  # final predict/complete after input has been consumed
  # (no scanning here, as there is no token left to match)
  for item in last(items)
    (lhs, rhs) = grammar.rules[item.rule]
    if item.dot > length(rhs)
      complete!(items, length(items), grammar, item)
    elseif rhs[item.dot] isa Symbol
      predict!(items, length(items), grammar, item, null)
    else
      # we're stuck
    end
  end

  return items
end

function complete!(items, current, grammar, item)
  # `item` is a finished rule; its left-hand side is the nonterminal that has just been recognised.
  (completed_term, completed_term_rhs) = grammar.rules[item.rule]
  @assert item.dot > length(completed_term_rhs)

  # Every item at the position where `item` started that is waiting for exactly
  # this nonterminal may now advance its dot. The set is iterated lazily on
  # purpose: it may be the very set that is being extended.
  for waiting in items[item.start]
    (_, waiting_rhs) = grammar.rules[waiting.rule]
    waiting.dot <= length(waiting_rhs) || continue
    waiting_rhs[waiting.dot] == completed_term || continue
    push!(items[current], Item(waiting.start, waiting.rule, waiting.dot+1))
  end
end

function predict!(items, current, grammar, item, nullableset)
  (lhs, rhs) = grammar.rules[item.rule]
  @assert rhs[item.dot] isa Symbol
  expected = rhs[item.dot]
  target = items[current]

  # A nullable nonterminal may match nothing at all, so the dot can be moved across it immediately.
  if expected in nullableset
    push!(target, Item(item.start, item.rule, item.dot+1))
  end

  # Start a fresh partial parse at the current position for every rule that produces the expected nonterminal.
  for (index, (candidate, _)) in enumerate(grammar.rules)
    candidate == expected || continue
    push!(target, Item(current, index, 1))
  end
end

function scan!(items, current, grammar, item)
  # The terminal under the dot matched the current token: the advanced item belongs to the next chart position.
  advanced = Item(item.start, item.rule, item.dot+1)
  return push!(items[current+1], advanced)
end


"""`nullables(grammar)`

Given a `grammar` (as described in `recognize`), return the set (`Set{Symbol}`)
of nullable nonterminals, i.e. nonterminals that can produce the empty string.
"""
function nullables(grammar :: CFG{T}) where {T}
  # TODO: currently this function has worst case time complexity O(length(grammar.rules)^3). A better technique is described in https://github.com/jeffreykegler/kollos/blob/master/notes/misc/loup2.md
  null = Set{Symbol}()
  # Fixed-point iteration: sweep over all rules until a full sweep adds nothing new.
  changed = true
  while changed
    changed = false
    for (lhs, rhs) in grammar.rules
      lhs in null && continue
      # A left-hand side is nullable if every term of one of its rules is nullable
      # (which is trivially the case for an empty right-hand side).
      if all(in(null), rhs)
        push!(null, lhs)
        changed = true
      end
    end
  end
  return null
end

function nullables(grammar :: Grammar)
  # delegate to the method for the wrapped CFG
  return nullables(grammar.cfg)
end



### Parser datatypes

"""`CompletedItem(start, stop, rule)`

Compact representation of a completed partial parse.

* `start` is the position in the token stream where the partial parse begins,
  i.e. `start` is one if the parse starts at the first token, etc.
* `stop` is the position in the token stream where the partial parse ends,
  i.e. `stop` is one if the parse stops before the first token, two if it
  stops after the first token, three if it stops after the second token, etc.
* `rule` is the index of the rule being parsed in the list of rules, i.e. one
  for the first rule, two for the second rule, etc.
  The value `0` may be used as a sentinel to mark a matched terminal where
  appropriate.
"""
struct CompletedItem
  start :: Int # chart position at which the completed partial parse begins
  stop :: Int # chart position at which it ends (`start + 1` for a single matched terminal)
  rule :: Int # May be `0` iff it corresponds to a terminal on the RHS of a production rule.
end


"""`ParseTree`

Data structure representing a parse tree, or a node therein. The following
members may be accessed:

- `rule`: Index of the rule corresponding to this node in the tree. Values 1
  through `length(rules)` are used for actual rules, `0` is used for
  matched terminals.
- `start`: Start index of the subset of the input represented by this node.
- `stop`: One plus the last index of the subset of the input represented by
  this node. If `stop` == `start`, the node matches an empty subset. If
  `rule == 0` (i.e. the node corresponds to a terminal), `stop = start+1`,
  and `start` is the index of the terminal in the input.
- `children :: Vector{ParseTree}`: A ParseTree (node) for each term on the
  right-hand side of the production `rule`, in order. For terminals, this
  will be empty.
"""
struct ParseTree
  rule :: Int # May be `0` iff it corresponds to a terminal on the RHS of a production rule.
  start :: Int # position of the first input token covered by this node
  stop :: Int # one past the last covered position; equal to `start` for an empty match
  children :: Vector{ParseTree} # one node per right-hand-side term, in order; empty for terminals
end



### Parser

"""`process_chart_for_parser(grammar::CFG, chart)`

Processes the chart (list of partial parses) for later use by the parser. The
core Earley algorithm returns a list of partial parses, some of which are
complete, and some of which are not. The parser only needs completed partial
parses and also needs (or at least prefers) them indexed differently.

This function takes a list of lists of Earley items where the outer list
implicitly stores the _last_ token consumed and returns a list of lists of
Earley items where every item corresponds to a _completed_ partial parse and
the outer list implicitly stores the _first_ token consumed. Moreover, the
inner lists are sorted by rule, from smallest rule index to largest.
"""
function process_chart_for_parser(grammar :: CFG, chart)
  # One bucket per chart position; a bucket collects the completed parses that *start* there.
  completed = [CompletedItem[] for _ in eachindex(chart)]
  for (stop, items) in enumerate(chart), item in items
    (_, rhs) = grammar.rules[item.rule]
    # Unfinished items (dot still inside the right-hand side) are of no use to the parser.
    item.dot > length(rhs) || continue
    push!(completed[item.start], CompletedItem(item.start, stop, item.rule))
  end
  # Within every bucket, order the entries by ascending rule index.
  foreach(bucket -> sort!(bucket; by=entry->entry.rule), completed)
  return completed
end

"""`parse(grammar::Grammar, input)`

Parse the given `input` according to the production rules and semantic actions
in `Grammar`, returning the result of the semantic action corresponding to the
rule that matches the whole input.

If the `grammar` is ambiguous, the parser will always use the first matching
rule to resolve the ambiguity.

Cyclic grammars are not supported as these can produce an infinite parse tree
for a finite input. Ambiguity, left-recursion and/or right-recursion do not
generally pose a problem for the parser. All unambiguous context-free grammars
are supported.

Examples
========

```
julia> g_num = Grammar([
  (:Number   => [:Unsigned],                 n      -> n),
  (:Number   => ['+', :Unsigned],            (_, n) -> n),
  (:Number   => ['-', :Unsigned],            (_, n) -> -n),
  (:Unsigned => [Match.Digit()],             d      -> (d - '0')),
  (:Unsigned => [:Unsigned, Match.Digit()],  (n, d) -> 10n + (d - '0'))
]);
```
matches a decimal integer (with optional sign) and computes its value:
```
julia> parse(g_num, "42")
42

julia> parse(g_num, "-3141")
-3141
```

```
julia> g_num = Grammar{String}([
  (:Block => ["{}"],                           _         -> ()),
  (:Block => ["if()", :Block],                 (args...) -> args),
  (:Block => ["if()", :Block, "else", :Block], (args...) -> args),
]);
```
is a trivial C-like grammar with "dangling else" ambiguity. The input is
a list of `String`s rather than a string (of `Char`s):
```
julia> parse(g_num, ["if()", "if()", "{}", "else", "{}"])
("if()", ("if()", (), "else", ()))
```

```
julia> g = Grammar([
  (:A => [:A],  identity),
  (:A => ['a'], identity),
  ]);
```
is a cyclic grammar and is unsupported.
"""
function Base.parse(grammar :: Grammar, input)
  checkgrammar(grammar)
  # Chart positions count tokens, whereas a string is indexed by code unit, so
  # `input[position]` fails (or picks the wrong character) for non-ASCII text.
  # Materialising the characters makes position `k` always mean "the k-th token".
  # Any other kind of input is passed through untouched.
  tokens = input isa AbstractString ? collect(input) : input
  completed_chart = process_chart_for_parser(grammar.cfg, chart(grammar.cfg, tokens))
  # Buckets are sorted by rule index, so the first rule for the start symbol
  # that spans the whole input wins if the grammar is ambiguous.
  for item in first(completed_chart)
    if item.stop > length(tokens) && grammar.cfg.rules[item.rule][1] == grammar.cfg.start_symbol
      return do_parse(grammar, tokens, completed_chart, item)
    end
  end
  # No completed start-symbol rule covers the entire input.
  error("no parse")
end


"""`parse(grammar :: CFG, input)`

Parse a given `input`, returning a `ParseTree`.

The method `parse(::Grammar, input)` should be preferred to this one in most
cases. The same restrictions on the `grammar` apply to both methods.
"""
function Base.parse(grammar :: CFG, input)
  checkgrammar(grammar)
  # Chart positions count tokens, whereas a string is indexed by code unit, so
  # `input[position]` fails (or picks the wrong character) for non-ASCII text.
  # Materialising the characters makes position `k` always mean "the k-th token".
  # Any other kind of input is passed through untouched.
  tokens = input isa AbstractString ? collect(input) : input
  completed_chart = process_chart_for_parser(grammar, chart(grammar, tokens))
  # Buckets are sorted by rule index, so the first rule for the start symbol
  # that spans the whole input wins if the grammar is ambiguous.
  for item in first(completed_chart)
    if item.stop > length(tokens) && grammar.rules[item.rule][1] == grammar.start_symbol
      return do_parse(grammar, tokens, completed_chart, item)
    end
  end
  # No completed start-symbol rule covers the entire input.
  error("no parse")
end


function do_parse(grammar :: CFG, input, chart, item)
  # Rule index 0 marks a matched terminal, which becomes a leaf.
  if item.rule == 0
    return ParseTree(item.rule, item.start, item.stop, ParseTree[])
  end
  # Otherwise split the item into one completed item per right-hand-side term and build the subtrees in order.
  parts = decompose(grammar, input, chart, item)
  children = ParseTree[do_parse(grammar, input, chart, part) for part in parts]
  return ParseTree(item.rule, item.start, item.stop, children)
end

function do_parse(grammar :: Grammar, input, chart, item)
  # Rule index 0 marks a matched terminal: its value is the input token itself.
  item.rule == 0 && return input[item.start]
  # Otherwise evaluate the sub-parses in order and feed their results to the rule's semantic action.
  action = grammar.actions[item.rule]
  parts = decompose(grammar.cfg, input, chart, item)
  return action((do_parse(grammar, input, chart, part) for part in parts)...)
end


"""`finditems(grammar, chart, term, start, stop)`

Return an iterable of all valid parses of the nonterminal `term` starting at
`start` and ending at or before `stop`. `chart` must be a processed set of completed
earley items as returned by `process_chart_for_parser()`.

The items are sorted by rule index, then by length.
"""
function finditems(grammar, chart, term, start, stop)
  # An empty range cannot contain any parse.
  if start > stop
    return CompletedItem[]
  end
  # Keep the completed items starting at `start` that belong to a rule for `term`
  # and do not extend beyond `stop`.
  result = filter(chart[start]) do item
    (lhs, _) = grammar.rules[item.rule]
    item.stop ≤ stop && lhs == term
  end
  # Lexicographic order: rule index first, then length (all items share the same
  # start, so comparing `stop` compares lengths). Sorting by a key tuple gives a
  # proper ordering; comparing with `rule < rule || stop < stop` does not, since
  # two items could then each be "less than" the other.
  sort!(result; by=item->(item.rule, item.stop))
  return result
end


"""`decompose(grammar::CFG, input, chart, item)`

Given a completed Earley `item`, return a list of completed items, with each
element corresponding to the right-hand side of the `item.rule` in the
`grammar`.
"""
function decompose(grammar :: CFG, input, chart, item) :: Vector{CompletedItem}
  (lhs, rhs) = grammar.rules[item.rule]

  # A rule with an empty right-hand side has nothing to decompose.
  if isempty(rhs)
    return CompletedItem[]
  end

  # The workstack is a list of items to be considered for every term on the right-hand side of the production rule.
  # The bottom-most element is a list of possible items for the first rule on the rhs, the one above it is a list of possible items for the second rule on the rhs, etc.
  workstack = Stack{Queue{CompletedItem}}()

  # `items` is a list of possible completed items for every term on the rhs so far. Parses are matched greedily, one term at a time, until a match is definitely known to fail.
  # In that case, we back up one term and try the next candidates.
  items = Stack{CompletedItem}()

  # Populate the initial workstack
  if first(rhs) isa Symbol
    push!(workstack, Queue{CompletedItem}())
    # Note: the loop variable shadows the argument `item` only inside this loop;
    # the iterable is evaluated with the argument, which is unchanged afterwards.
    for item in finditems(grammar, chart, first(rhs), item.start, item.stop)
      enqueue!(first(workstack), item)
    end
  elseif matches(first(rhs), input[item.start])
    push!(workstack, Queue{CompletedItem}())
    # a matched terminal is represented by rule index 0 and covers exactly one token
    enqueue!(first(workstack), CompletedItem(item.start, item.start+1, 0))
  else
    error("This should never happen")
  end

  # Find appropriate items, one at a time.
  # NOTE(review): `first(workstack)` / `first(items)` are presumably the top (most recently pushed) elements — confirm against DataStructures.
  while !isempty(workstack)
    # invariant: there is one candidate queue per chosen item, plus one for the term to be matched next
    @assert length(workstack) == length(items) + 1
    # success: every term has an item and the last one ends exactly where `item` ends
    if length(items) == length(rhs) && first(items).stop == item.stop
      return collect(Iterators.reverse(items))
    end
    if !isempty(first(workstack)) && length(items) < length(rhs)
      # try to match the next item
      it = dequeue!(first(workstack))
      # candidates reaching beyond the end of `item` are dropped
      if it.stop ≤ item.stop
        push!(items, it)
        # candidates for the term following `it`; stays empty if `it` was the last term
        next_candidates = Queue{CompletedItem}()
        push!(workstack, next_candidates)
        if length(items) < length(rhs)
          next_term = rhs[length(items)+1]
          if next_term isa Symbol
            # Matching nonterminal rules
            for i in finditems(grammar, chart, next_term, it.stop, item.stop)
              enqueue!(next_candidates, i)
            end
          elseif it.stop ≤ length(input) && matches(next_term, input[it.stop])
            # Matching terminal
            enqueue!(next_candidates, CompletedItem(it.stop, it.stop+1, 0))
          end
        end
      end
    else
      # backtrack
      # NOTE(review): if the candidates for the first term run out, `items` is empty at this point — check what `pop!` does then.
      pop!(workstack)
      pop!(items)
    end
  end
  error("No solution - this should never happen")
end


### check grammar
"""`checkgrammar(grammar)`

Check if a given grammar can be parsed meaningfully. If the grammar is ok,
nothing will be returned and no action will be performed.

If any term appearing on the right-hand side of a production has no rule with
that term appearing on the left-hand side of a production, a warning will be
emitted.

If the grammar is cyclic (allows for a derivation A => A for any nonterminal
A, transitively), an `ErrorException` will be raised.
"""
function checkgrammar(grammar :: CFG)
  # TODO: check for symbols on the lhs that cannot be reached from the start symbol

  # Nonterminals that are used on some right-hand side but never defined by a rule
  defined = Set(lhs for (lhs, rhs) in grammar.rules)
  used = Set(term for (lhs, rhs) in grammar.rules for term in rhs if term isa Symbol)
  undefined = setdiff(used, defined)
  if !isempty(undefined)
    # only the first offending symbol is reported
    @warn "Symbol does not appear on the left-hand side of any production: $(first(undefined))"
  end

  # Cyclic grammars are rejected outright.
  iscyclic(grammar) && error("grammar is cyclic") # TODO: include the actual cycle in the error message
  return nothing
end

function checkgrammar(grammar :: Grammar)
  # delegate to the method for the wrapped CFG
  return checkgrammar(grammar.cfg)
end


"""`iscyclic(grammar)`

Returns true iff the grammar is cyclic, i.e. allows for derivation of A ⇒* A,
where A is a nonterminal and ⇒* means performing an arbitrary number of
replacements according to the production rules.
"""
function iscyclic(grammar :: CFG)
  ns = nullables(grammar) # TODO: avoid computing this twice

  # Build the "unit replacement" relation: `A` can be rewritten to `B` alone if
  # some rule `A => … B …` has nothing but nullable terms on both sides of `B`.
  replacement_rules = Dict{Symbol,Set{Symbol}}()
  for (term1, rhs) in grammar.rules
    targets = get!(replacement_rules, term1, Set{Symbol}())
    for (i, term2) in enumerate(rhs)
      term2 isa Symbol || continue
      if all(in(ns), rhs[1:i-1]) && all(in(ns), rhs[i+1:end])
        push!(targets, term2)
      end
    end
  end

  # For every nonterminal, expand the set of symbols reachable through
  # replacements until it stops growing; reaching the nonterminal itself is a cycle.
  for (term, direct) in replacement_rules
    visited = Set{Symbol}()
    frontier = direct
    while !isempty(frontier)
      term in frontier && return true
      union!(visited, frontier)
      reachable = union((get(replacement_rules, t, Set{Symbol}()) for t in visited)...)
      frontier = setdiff(reachable, visited)
    end
  end
  return false
end

function iscyclic(grammar :: Grammar)
  # delegate to the method for the wrapped CFG
  return iscyclic(grammar.cfg)
end


### Debugging helper functions

"""`printchart(grammar, chart)`

Print an earley chart (list of partial parses) to stdout in a somewhat human-
readable format.
"""
function printchart(grammar, chart)
  # Print to standard output; the stream-taking method below does the formatting.
  return printchart(Base.stdout, grammar, chart)
end

"""`printchart(io, grammar, chart)`

Like `printchart(grammar, chart)`, but writes to the stream `io` instead of
stdout.
"""
function printchart(io :: IO, grammar, chart)
  for (i, items) in enumerate(chart)
    println(io, "(At token $i):")
    for item in items
      (lhs, rhs) = grammar.rules[item.rule]
      # The terms before the dot have been matched already, the terms after it are still expected.
      print(io, " [Rule $(item.rule)] $lhs ⇒ ")
      print(io, join(repr.(rhs[1:item.dot-1]), " "))
      print(io, " • ")
      print(io, join(repr.(rhs[item.dot:end]), " "))
      println(io, " (starting at $(item.start))")
    end
  end
end

"""`printparsetree(grammar, input, tree)`

Print a `ParseTree` to stdout in a somewhat readable form, with
each line corresponding to a node and indentation matching depth.
"""
function printparsetree(grammar, input, t; depth=0)
  # one space of indentation per level of nesting
  indent = " "^depth
  if t.rule == 0
    # rule index 0 marks a matched terminal, which is always a leaf
    println(indent * "Terminal '$(input[t.start])' at $(t.start)")
    @assert isempty(t.children)
  else
    # inner node: print the production and the covered range, then the subtrees one level deeper
    lhs, rhs = grammar.rules[t.rule]
    println(indent * "$lhs ⇒ " * join(repr.(rhs), " ") * ", from $(t.start) to $(t.stop)")
    foreach(child -> printparsetree(grammar, input, child; depth=depth+1), t.children)
  end
end

end # module

A  => src/Match.jl +144 -0
@@ 1,144 @@
"""`Match`

This submodule contains several datatypes that represent certain classes of
tokens. Using these datatypes in the grammar instead of enumerating symbols is
convenient for all but the most basic grammars.

The following datatypes for unicode characters are available:

* `ASCII` - matches ASCII characters
* `Space` - matches Unicode whitespace
* `Digit` - matches a decimal digit (0 through 9)
* `HexDigit` - matches a hexadecimal digit (0 through 9 and 'a' through 'f', in upper or lower case)
* `Letter` - matches a Unicode letter
* `Lower` - matches a Unicode lower case character
* `Upper` - matches a Unicode upper case character
* `Print` - matches a Unicode printable character

Moreover, the following datatypes are available for all types of tokens:

* `AnyToken` - matches any token regardless of type or value
* `OneOf` - matches a token (class) from a predefined list
* `Predicate` - matches a token if that token fulfills a given predicate
"""
module Match

export OneOf
export ASCII, Space, Digit, Letter, Lower, Upper, Print, AnyToken
export matches

"""`matches(class, token)`

Decides whether `token` is a member of the token `class`. Unless a more
specific method exists, a class matches exactly the tokens it compares equal
to, i.e. `matches(class, token) = class == token`.

For context, see also: `recognize`, `parse`. For a list of predefined token
classes, see `Match`.
"""
matches(a, b) = a == b

"""`OneOf(alternatives)`

Token class that matches whenever at least one of the given `alternatives`
matches. The alternatives may be plain tokens or token classes; any iterable
is accepted and stored as a vector. For more information see `matches`.
"""
struct OneOf{T}
  alternatives :: Vector{T}
  OneOf(as :: Vector) = new{eltype(as)}(as)
  OneOf(as) = OneOf(collect(as))
end

matches(as :: OneOf, b) = any(alternative -> matches(alternative, b), as.alternatives)

"""`ASCII()`

Token class for ASCII characters (tokens for which `isascii` returns true).
"""
struct ASCII end

matches(::ASCII, token) = isascii(token)

"""`Space()`

Token class for whitespace (tokens for which `isspace` returns true).
"""
struct Space end

matches(::Space, token) = isspace(token)

"""`Digit()`

Token class for the decimal digits '0' through '9' (tokens for which `isdigit`
returns true).
"""
struct Digit end

matches(::Digit, token) = isdigit(token)

"""`Letter()`

Token class for unicode letters (tokens for which `isletter` returns true).
"""
struct Letter end

matches(::Letter, token) = isletter(token)

"""`Lower()`

Token class for lower-case unicode letters (tokens for which `islowercase`
returns true).
"""
struct Lower end

matches(::Lower, token) = islowercase(token)

"""`Upper()`

Token class for upper-case unicode letters (tokens for which `isuppercase`
returns true).
"""
struct Upper end

matches(::Upper, token) = isuppercase(token)

"""`Print()`

Token class for printable characters, including whitespace (tokens for which
`isprint` returns true).
"""
struct Print end

matches(::Print, token) = isprint(token)

"""`HexDigit()`

Token class for hexadecimal digits, i.e. the digits '0' through '9' and the
latin letters 'a' through 'f' in either case (tokens for which `isxdigit`
returns true).
"""
struct HexDigit end

matches(::HexDigit, token) = isxdigit(token)

"""`AnyToken`

Token class that matches every token, whatever its type or value.
"""
struct AnyToken end

matches(::AnyToken, ::Any) = true

"""`Predicate(p)`

Token class that matches exactly those tokens `t` for which `p(t)` is true.
"""
struct Predicate{P}
  p :: P
end

matches(pred :: Predicate, t) = pred.p(t)

end # Match

A  => test/json.jl +72 -0
@@ 1,72 @@
module JSON

using DataStructures
using Earley

# This is a JSON grammar translated from augmented BNF in RFC 4627.
# It should not be assumed to be correct and is only used to test the Earley parser.
#
# Every entry is a tuple `(lhs => rhs, action)`. The action is called with one
# argument per term on the right-hand side and its result is the value of the rule.
grammar = Grammar([
  # JSON text
  # (the top level must be an object or an array)
  (:JSON => [:Object], identity),
  (:JSON => [:Array],  identity),

  # Structural characters
  # (each may be surrounded by whitespace; their values are discarded)
  (Symbol("Begin Array")     => [:WS, '[', :WS], (_,_,_) -> nothing),
  (Symbol("Begin Object")    => [:WS, '{', :WS], (_,_,_) -> nothing),
  (Symbol("End Array")       => [:WS, ']', :WS], (_,_,_) -> nothing),
  (Symbol("End Object")      => [:WS, '}', :WS], (_,_,_) -> nothing),
  (Symbol("Name Separator")  => [:WS, ':', :WS], (_,_,_) -> nothing),
  (Symbol("Value Separator") => [:WS, ',', :WS], (_,_,_) -> nothing),
  (:WS => [],                                    ()      -> nothing),
  (:WS => [:WS, Match.Space()],                  (_,_)   -> nothing),

  # Values
  # (the literals are spelled out character by character)
  (:Value => [:Object],        identity),
  (:Value => [:Array],         identity),
  (:Value => [:Number],        identity),
  (:Value => [:String],        identity),
  (:Value => collect("false"), (_...) -> false),
  (:Value => collect("null"),  (_...) -> nothing),
  (:Value => collect("true"),  (_...) -> true),

  # Objects
  # (members are collected as name => value pairs and turned into an OrderedDict)
  (:Object  => [Symbol("Begin Object"), :Members, Symbol("End Object")], (_, members, _) -> OrderedDict(members)),
  (:Members => [],                                             ()       -> Pair{Any,Any}[]),
  (:Members => [:Member],                                      m        -> Pair{Any,Any}[m]),
  (:Members => [:Members, Symbol("Value Separator"), :Member], (ms,_,m) -> vcat(ms, [m])),
  (:Member  => [:String, Symbol("Name Separator"), :Value],    (s,_,v) -> (s=>v)),

  # Arrays
  # (elements are collected in a Vector{Any})
  (:Array  => [Symbol("Begin Array"), :Values, Symbol("End Array")], (_,vs,_) -> vs),

  (:Values => [],                                           ()       -> Any[]),
  (:Values => [:Value],                                     (v)      -> Any[v]),
  (:Values => [:Values, Symbol("Value Separator"), :Value], (vs,_,v) -> vcat(vs, [v])),

  # Numbers
  # (sign, integer part, fraction and exponent are parsed separately and combined into a BigFloat;
  # digit sequences are kept as strings until then)
  (:Number => [Symbol("Optional Minus"), :Int, Symbol("Optional Fractional Part"), Symbol("Optional Exponent")], (s,x,f,e) -> s * parse(BigFloat, "$x.$(f)") * 10^e),
  (Symbol("Optional Minus") => [],    ()  -> 1),
  (Symbol("Optional Minus") => ['-'], (_) -> -1),
  (:Int => [Match.Digit()],                     (c)     -> string(c)),
  (:Int => [Match.OneOf("123456789"), :Digits], (c, ds) -> c * ds),
  (Symbol("Optional Fractional Part") => [],             ()     -> "0"),
  (Symbol("Optional Fractional Part") => ['.', :Digits], (_,ds) -> ds),
  (:Digits => [Match.Digit()],          (c)    -> string(c)),
  (:Digits => [Match.Digit(), :Digits], (c,ds) -> c * ds),
  (Symbol("Optional Exponent") => [],                                                    () -> 0),
  (Symbol("Optional Exponent") => [Match.OneOf("eE"), Symbol("Optional Sign"), :Digits], (_,s,ds) -> s * parse(BigFloat, ds)),
  (Symbol("Optional Sign") => [],    ()  -> 1),
  (Symbol("Optional Sign") => ['+'], (_) -> 1),
  (Symbol("Optional Sign") => ['-'], (_) -> -1),

  # Strings
  # (unescaped characters are every code point from 0x20 up except '"' (0x22) and '\\' (0x5c);
  # escape sequences are decoded with `unescape_string`)
  (:String  => ['"', :Chars, '"'], (_, s, _) -> s),
  (:Chars   => [],                 ()        -> ""),
  (:Chars   => [:Chars, :Char],    (cs,c)    -> cs * c),
  (:Char    => [Match.Predicate(c -> (0x20 ≤ Int(c) ≤ 0x21) || (0x23 ≤ Int(c) ≤ 0x5b) || (0x5d ≤ Int(c) ≤ 0x10ffff))], identity),
  (:Char    => [:Escaped], c -> string(c)),
  (:Escaped => ['\\', Match.OneOf("\"\\/bfnrt")], (_,c) -> unescape_string("\\$c")),
  (:Escaped => ['\\', 'u', Match.HexDigit(), Match.HexDigit(), Match.HexDigit(), Match.HexDigit()], (_,_,cs...) -> unescape_string("\\u" * String(collect(cs)))),
])

end

A  => test/runtests.jl +580 -0
@@ 1,580 @@
using Test
using Earley
include("json.jl")

@testset "nullables()" begin
  # Each case pairs a grammar with the set of nonterminals expected to be nullable.
  cases = (
    # no empty production at all
    (CFG([
      :A => ['a'],
      :A => ['a', :A],
      :B => ['b'],
      :B => [:B, 'b']
    ]), Set([])),

    # nullability propagates through rules made of nullable terms only
    (CFG([
      :A => [],
      :B => [:A, 'a'],
      :C => [:A, 'c'],
      :C => [:A, :A],
    ]), Set([:A, :C])),

    # chain of unit rules ending in an empty production
    (CFG([
      :A => [:D],
      :B => [:A],
      :C => [:B],
      :D => [:C],
      :D => []
    ]), Set([:A, :B, :C, :D])),

    # a self-referential rule alone does not make a symbol nullable
    (CFG([
      :A => [:A]
    ]), Set([])),

    # mutually recursive rules where only some symbols are nullable
    (CFG([
      :A => [:D, :C, :B],
      :B => [:A, :A],
      :C => [:A, :B, :D],
      :D => [:D],
      :D => [:C, :C, :B],
      :A => []
    ]), Set([:A, :B])),

    # mixture of terminals and nullable nonterminals
    (CFG([
      :A => [],
      :A => [:A, 'a'],
      :B => [:A],
      :B => [:B, 'b'],
      :C => [:A, 'c', :B],
      :C => [:B, 'c', :A],
      :D => [:B],
      :D => [:C],
      :D => [:D, 'd']
    ]), Set([:A, :B, :D])),
  )

  for (grammar, expected) in cases
    @test Earley.nullables(grammar) == expected
  end
end

@testset "Match" begin
  # Every character class is probed with the same eleven characters:
  # 'a', 'X', '\r', ' ', '.', '0', '8', 'α', 'Σ', 'す' and '🙈'.

  @testset "OneOf" begin
    decimal = Match.OneOf("0123456789")
    for tok in ('0', '3', '8')
      @test matches(decimal, tok)
    end
    for tok in ('x', "nine")
      @test !matches(decimal, tok)
    end

    keywords = Match.OneOf(["if", "true", "false", "else"])
    for tok in ("if", "true")
      @test matches(keywords, tok)
    end
    @test !matches(keywords, "bool")
  end

  @testset "ASCII" begin
    class = Match.ASCII()
    for tok in ('a', 'X', '\r', ' ', '.', '0', '8')
      @test matches(class, tok)
    end
    for tok in ('α', 'Σ', 'す', '🙈')
      @test !matches(class, tok)
    end
  end

  @testset "Space" begin
    class = Match.Space()
    for tok in ('\r', ' ')
      @test matches(class, tok)
    end
    for tok in ('a', 'X', '.', '0', '8', 'α', 'Σ', 'す', '🙈')
      @test !matches(class, tok)
    end
  end

  @testset "Digit" begin
    class = Match.Digit()
    for tok in ('0', '8')
      @test matches(class, tok)
    end
    for tok in ('a', 'X', '\r', ' ', '.', 'α', 'Σ', 'す', '🙈')
      @test !matches(class, tok)
    end
  end

  @testset "Letter" begin
    class = Match.Letter()
    for tok in ('a', 'X', 'α', 'Σ', 'す')
      @test matches(class, tok)
    end
    for tok in ('\r', ' ', '.', '0', '8', '🙈')
      @test !matches(class, tok)
    end
  end

  @testset "Lower" begin
    class = Match.Lower()
    for tok in ('a', 'α')
      @test matches(class, tok)
    end
    for tok in ('X', '\r', ' ', '.', '0', '8', 'Σ', 'す', '🙈')
      @test !matches(class, tok)
    end
  end

  @testset "Upper" begin
    class = Match.Upper()
    for tok in ('X', 'Σ')
      @test matches(class, tok)
    end
    for tok in ('a', '\r', ' ', '.', '0', '8', 'α', 'す', '🙈')
      @test !matches(class, tok)
    end
  end

  @testset "Print" begin
    class = Match.Print()
    for tok in ('a', 'X', ' ', '.', '0', '8', 'α', 'Σ', 'す', '🙈')
      @test matches(class, tok)
    end
    @test !matches(class, '\r')
  end

  @testset "HexDigit" begin
    class = Match.HexDigit()
    for tok in ('a', '0', '8')
      @test matches(class, tok)
    end
    for tok in ('X', '\r', ' ', '.', 'α', 'Σ', 'す', '🙈')
      @test !matches(class, tok)
    end
  end

  @testset "AnyToken" begin
    class = Match.AnyToken()
    # arbitrary token types are accepted as well, not just characters
    for tok in ('a', 'X', '\r', ' ', '.', '0', '8', 'α', 'Σ', 'す', '🙈', "false", 25)
      @test matches(class, tok)
    end
  end

  @testset "Predicate" begin
    alphanumeric = Match.Predicate(c -> isletter(c) || isdigit(c))
    for tok in ('a', 'X', '0', '8', 'α', 'Σ', 'す')
      @test matches(alphanumeric, tok)
    end
    for tok in ('\r', ' ', '.', '🙈')
      @test !matches(alphanumeric, tok)
    end
  end

  @testset "Advanced OneOf" begin
    # alternatives may themselves be token classes …
    lower_or_digit = Match.OneOf([Match.Lower(), Match.Digit()])
    for tok in ('a', '0', '9', 'x', 'α')
      @test matches(lower_or_digit, tok)
    end
    for tok in ('A', 'Z', '!')
      @test !matches(lower_or_digit, tok)
    end

    # … including nested `OneOf`s mixed with plain tokens
    nested = Match.OneOf([Match.OneOf(["true", "false"]), "bool"])
    for tok in ("true", "bool")
      @test matches(nested, tok)
    end
    @test !matches(nested, "int")
  end
end

@testset "recognize(::CFG)" begin
  # Each grammar below is followed by the inputs it must accept and the inputs it must reject.

  # Nested round parentheses (no concatenation of groups).
  nested_round = CFG([
    :Par => [], # matches empty string
    :Par => ['(', :Par, ')'],
  ])
  for input in [
    "",
    "()",
    "((()))",
    "(((((((((((((((((((((((((((((())))))))))))))))))))))))))))))",
  ]
    @test recognize(nested_round, input)
  end
  for input in [
    "(",
    ")",
    ")(",
    "(12)",
    "()()",
    "((((((((((((((((((((((((((((())))))))))))))))))))))))))))))",
    "(((((((((((((((((((((((((((((()))))))))))))))))))))))))))))",
  ]
    @test !recognize(nested_round, input)
  end

  # Nested round and square brackets.
  nested_mixed = CFG([
    :Par => [],
    :Par => ['(', :Par, ')'],
    :Par => ['[', :Par, ']'],
  ])
  for input in ["", "()", "[]", "(([()]))", "([[([(())])]])"]
    @test recognize(nested_mixed, input)
  end
  for input in ["][", "12"]
    @test !recognize(nested_mixed, input)
  end

  # left-recursive grammar matching an even number of 'a' characters
  even_a_left = CFG([
    :A => [],
    :A => [:A, 'a', 'a']
  ])
  for input in ["", "aa", "aaaa"]
    @test recognize(even_a_left, input)
  end
  for input in ["a", "aaa"]
    @test !recognize(even_a_left, input)
  end

  # right-recursive grammar matching an even number of 'a' characters
  even_a_right = CFG([
    :A => [],
    :A => ['a', 'a', :A]
  ])
  for input in ["", "aa", "aaaa"]
    @test recognize(even_a_right, input)
  end
  for input in ["a", "aaa"]
    @test !recognize(even_a_right, input)
  end

  # a simultaneously left-recursive, right-recursive, and ambiguous grammar
  some_a = CFG([
    :A => ['a'],
    :A => [:A, :A]
  ])
  for input in ["", "ab"]
    @test !recognize(some_a, input)
  end
  for input in ["a", "aa", "aaa", "aaaa", "aaaaa", "aaaaaa"]
    @test recognize(some_a, input)
  end

  # A grammar whose only rule is A → A: no input is expected to be recognized.
  only_cycle = CFG([
    :A => [:A]
  ])
  for input in ["", "a", "abc", "1234567890"]
    @test !recognize(only_cycle, input)
  end

  # A and B refer to each other, and B may also be empty.
  with_nullable = CFG([
    :A => ['a'],
    :A => [:B, :A],
    :B => [],
    :B => [:A]
  ])
  for input in ["a", "aaa"]
    @test recognize(with_nullable, input)
  end

  # Non-empty sequences over the characters 'a' and 'b', using a matcher as terminal.
  a_or_b = CFG([
    :AB => [Match.OneOf("ab")],
    :AB => [:AB, :AB]
  ])
  for input in ["", "acb"]
    @test !recognize(a_or_b, input)
  end
  for input in [
    "ab",
    "ba",
    "aa",
    "bbbbbb",
    "abababaabbba",
    "abababaabbbaababbabababababababbababaaaaabbabbbbbb",
  ]
    @test recognize(a_or_b, input)
  end

  # Optionally signed decimal integers without superfluous leading zeros.
  decimal = CFG([
    :Decimal => [:Nonnegative],
    :Decimal => [Match.OneOf("+-"), :Nonnegative],
    :Nonnegative => [Match.Digit()],
    :Nonnegative => [Match.OneOf("123456789"), :Digits],
    :Digits => [],
    :Digits => [:Digits, Match.Digit()],
  ])
  for input in [
    "0",
    "1",
    "2",
    "-0",
    "+5",
    "12389348279824792837492",
    "+12389348279824792837492",
    "-12389348279824792837492",
  ]
    @test recognize(decimal, input)
  end
  for input in ["", "09", "023", "0x42"]
    @test !recognize(decimal, input)
  end
end

@testset "recognize(::CFG) large grammars" begin
  # Only the context-free part of the JSON test grammar is needed for recognition.
  json = JSON.grammar.cfg

  # Symbols of the JSON grammar that are expected to be reported as nullable.
  expected_nullables = Set([
    :WS,
    :Members,
    :Values,
    Symbol("Optional Minus"),
    Symbol("Optional Fractional Part"),
    Symbol("Optional Exponent"),
    Symbol("Optional Sign"),
    :Chars,
  ])
  @test Earley.nullables(json) == expected_nullables

  # Documents the grammar must accept.
  for document in [
    "{}",
    "[]",
    "[true]",
    """[""]""",
    """["green\\nしろ"]""",
    "[1]",
    "[-0]",
    "[123456789012345678901234567890]",
    "[2e6]",
    "[3.5]",
    "[-12.0]",
    "[true, null, false]",
    """{"twelve": null}""",
    """{ "α" :   7.3e-3\n,\n"R": 0.12820 }""",
  ]
    @test recognize(json, document)
  end

  # Inputs the grammar must reject: bare scalars at top level, an object
  # without a key, and a wrongly-cased literal.
  for document in ["12", "\"yellow\"", "{true}", "[FALSE]"]
    @test !recognize(json, document)
  end
end

@testset "recognize(::Grammar)" begin
  # Balanced parentheses with concatenation. Every semantic action calls error(),
  # so these assertions can only pass if recognize() never runs the actions.
  balanced = Grammar([
    (:Par => [],               ()      -> error()),
    (:Par => [:Par, :Par],     (_,_)   -> error()),
    (:Par => ['(', :Par, ')'], (_,_,_) -> error()),
  ])

  for input in [
    "",
    "()",
    "((()))",
    "()()",
    "(((()()(()))))",
    "(((((((((((((((((((((((((((((())))))))))))))))))))))))))))))",
  ]
    @test recognize(balanced, input)
  end

  for input in [
    "(",
    ")",
    ")(",
    "(12)",
    "((((()(()))))",
    "((((((((((((((((((((((((((((())))))))))))))))))))))))))))))",
    "(((((((((((((((((((((((((((((()))))))))))))))))))))))))))))",
  ]
    @test !recognize(balanced, input)
  end
end

@testset "checkgrammar()" begin
  # Every grammar in this testset must make Earley.checkgrammar throw an ErrorException.
  # NOTE(review): all of them contain a unit rule such as A → A or a chain that can lead
  # from a symbol back to itself; presumably that is what checkgrammar rejects — confirm
  # against its implementation.

  rejected_cfgs = (
    # A → A listed before the empty rule
    CFG([
      :A => [:A],
      :A => []
    ]),
    # the same two rules in the opposite order
    CFG([
      :A => [],
      :A => [:A]
    ]),
    # A → B and B → A refer to each other
    CFG([
      :A => [:B],
      :B => [:A],
      :A => ['a'],
      :B => ['b'],
    ]),
    # A → A hidden among other recursive rules
    CFG([
      :A => [:A, 'a'],
      :A => ['a', :A],
      :A => [:A],
      :A => ['a']
    ]),
    # no literal unit rule; A has an empty rule and appears in the rules of B
    CFG([
      :A => [:B, :B],
      :B => ['a', 'b'],
      :B => [:A, :B],
      :A => [],
      :B => [:A, :A, :A],
    ]),
  )
  for cfg in rejected_cfgs
    @test_throws ErrorException Earley.checkgrammar(cfg)
  end

  # The same check applied to grammars that carry semantic actions.
  rejected_grammars = (
    Grammar([
      (:A => [:A],  identity),
      (:A => ['a'], identity)
    ]),
    Grammar([
      (:A => [:B],  identity),
      (:B => [:A],  identity),
      (:A => ['a'], identity),
      (:B => ['b'], identity)
    ]),
  )
  for grammar in rejected_grammars
    @test_throws ErrorException Earley.checkgrammar(grammar)
  end
end

@testset "parse(::CFG)" begin
  # Nested round and square brackets; rules are referred to by their index (1, 2, 3) below.
  brackets = CFG([
    :Par => [],
    :Par => ['(', :Par, ')'],
    :Par => ['[', :Par, ']'],
  ])

  # True when `node` has the given rule index and start/stop positions and no children.
  isleaf(node, rule, start, stop) =
    node.rule == rule && node.start == start && node.stop == stop && isempty(node.children)

  # True when `node` has the given rule index and start/stop positions and exactly `n` children.
  isinner(node, rule, start, stop, n) =
    node.rule == rule && node.start == start && node.stop == stop && length(node.children) == n

  # In the expectations below, the nodes covering a single bracket token carry rule 0.

  @testset "parens, \"\"" begin
    root = parse(brackets, "")
    @test isleaf(root, 1, 1, 1)
  end

  @testset "parens, \"()\"" begin
    root = parse(brackets, "()")
    @test isinner(root, 2, 1, 3, 3)
    lparen, inner, rparen = root.children
    @test isleaf(lparen, 0, 1, 2)
    @test isleaf(inner, 1, 2, 2)
    @test isleaf(rparen, 0, 2, 3)
  end

  @testset "parens, \"([])\"" begin
    root = parse(brackets, "([])")
    @test isinner(root, 2, 1, 5, 3)
    lparen, square, rparen = root.children
    @test isleaf(lparen, 0, 1, 2)
    @test isinner(square, 3, 2, 4, 3)
    @test isleaf(rparen, 0, 4, 5)
    lbracket, innermost, rbracket = square.children
    @test isleaf(lbracket, 0, 2, 3)
    @test isleaf(innermost, 1, 3, 3)
    @test isleaf(rbracket, 0, 3, 4)
  end
end

@testset "parse(::Grammar)" begin

  @testset "left/right recursive" begin
    # Three grammars for sequences of 'a' whose actions record the derivation shape
    # as nested arrays: left-recursive, right-recursive, and recursive on both sides.
    g_left = Grammar([
      (:A => [],        ()    -> []),
      (:A => [:A, 'a'], (e,a) -> [e, a])
    ])
    g_right = Grammar([
      (:A => [],        ()    -> []),
      (:A => ['a', :A], (a,e) -> [a, e])
    ])
    g_both = Grammar([
      (:A => [],            ()        -> []),
      (:A => [:A, 'a', :A], (e1,a,e2) -> [e1, a, e2])
    ])

    # The empty input yields the result of the empty rule's action for every variant.
    for g in (g_left, g_right, g_both)
      @test parse(g, "") == []
    end

    # Left recursion nests towards the left ...
    for (input, expected) in (
      ("a",     [[], 'a']),
      ("aa",    [[[], 'a'], 'a']),
      ("aaaaa", [[[[[[], 'a'], 'a'], 'a'], 'a'], 'a']),
    )
      @test parse(g_left, input) == expected
    end

    # ... and right recursion towards the right.
    for (input, expected) in (
      ("a",     ['a', []]),
      ("aa",    ['a', ['a', []]]),
      ("aaaaa", ['a', ['a', ['a', ['a', ['a', []]]]]]),
    )
      @test parse(g_right, input) == expected
    end

    # Expected trees for the grammar that recurses on both sides.
    for (input, expected) in (
      ("a",  [[], 'a', []]),
      ("aa", [[], 'a', [[], 'a', []]]),
    )
      @test parse(g_both, input) == expected
    end
    # No expectation is pinned for the five-character input; the assertion stays disabled.
    #@test parse(g_both, "aaaaa")
  end

  @testset "dangling else" begin
    # Two grammars for a trivial C-like syntax with dangling else ambiguity.
    # They differ only in the order of the two :If rules, and that order decides
    # how the ambiguity is resolved.
    rules = [
      (:Block => ["{}"],                           (_)       -> []),
      (:Block => [:If],                            e         -> e),
      (:If    => ["if()", :Block],                 (_,b)     -> ["if()", b]),
      (:If    => ["if()", :Block, "else", :Block], (_,c,_,a) -> ["if()", c, a])
    ]
    g_inner = Grammar{String}(rules)
    g_outer = Grammar{String}([rules[1], rules[2], rules[4], rules[3]])

    # Unambiguous token sequences give the same result with either rule order.
    for g in (g_inner, g_outer)
      @test parse(g, ["{}"]) == []
      @test parse(g, ["if()", "{}"]) == ["if()", []]
      @test parse(g, ["if()", "{}", "else", "{}"]) == ["if()", [], []]
      @test parse(g, ["if()", "if()", "{}"]) == ["if()", ["if()", []]]
    end

    # This is where the ambiguity comes into play: the else attaches to the
    # inner if for g_inner and to the outer if for g_outer.
    ambiguous = ["if()", "if()", "{}", "else", "{}"]
    @test parse(g_inner, ambiguous) == ["if()", ["if()", [], []]]
    @test parse(g_outer, ambiguous) == ["if()", ["if()", []], []]
  end

  @testset "parse(::Grammar) large grammars" begin
    json = JSON.grammar

    # JSON text paired with the Julia value the grammar's actions must build from it.
    for (text, expected) in (
      ("{}", Dict()),
      ("[]", []),
      ("[123456789012345678901234567890]", [123456789012345678901234567890]),
      ("[-0]", [0]),
      ("[true, false, null]", [true, false, nothing]),
      ("""{"twelve": null}""", Dict("twelve" => nothing)),
      ("""{"\\r": [{}], "\\u03b1": "true"}""", Dict("\r" => [Dict()], "α" => "true")),
    )
      @test parse(json, text) == expected
    end

    # Floating-point results are compared approximately.
    @test parse(json, "[3.14195e5, 13e-2]") ≈ [314195, 0.13]

    # The input may also be given as a vector of characters instead of a String.
    chars = collect("""{ "α" :   "7.3e-3"\n,\n"R": "0.12820" }""")
    @test parse(json, chars) == Dict("α" => "7.3e-3", "R" => "0.12820")
  end

  @testset "mixed associativity" begin
    # A simple arithmetic grammar with mixed associativity; the actions build Julia
    # expressions so the results can be compared against quoted expressions.
    arithmetic = Grammar([
      (:expression => [:sum],      identity),
      (:expression => [:product],  identity),
      (:sum => [:sum, '+', :product], (e1,_,e2) -> Expr(:call, :+, e1, e2)),
      (:sum => [:sum, '-', :product], (e1,_,e2) -> Expr(:call, :-, e1, e2)),
      (:sum => [:product],                   identity),
      (:product => [:factor],                identity),
      (:product => [:product, '*', :factor], (e1,_,e2) -> Expr(:call, :*, e1, e2)),
      (:factor => [:number], identity),
      (:factor => [:power],  identity),
      (:factor => ['(', :expression, ')'], (_,e,_) -> e),
      (:number => [Match.Digit()], c -> c-'0'),
      (:power => [:factor, '^', :factor], (e1,_,e2) -> Expr(:call, :^, e1, e2)),
    ])

    # Source text paired with the explicitly parenthesized expression it must parse to.
    for (source, expected) in (
      ("1-2-3",         :((1-2)-3)),
      ("2*3*4",         :((2*3)*4)),
      ("2^3^4",         :(2^(3^4))),
      ("1+2-3+4",       :(((1+2)-3)+4)),
      ("2*3^4^(5+6)*7", :((2*3^(4^(5+6)))*7)),
      ("1-2*3^4+5",     :((1-2*3^4)+5)),
    )
      @test parse(arithmetic, source) == expected
    end
  end
end