A => .builds/archlinux.yml +19 -0
@@ 1,19 @@
+# build manifest for sr.ht: runs the tests on the packaged Julia and on the 1.6.7 LTS binary
+image: archlinux
+packages:
+  - gnupg  # used below to check the signature of the downloaded LTS tarball
+  - julia
+sources:
+  - https://git.sr.ht/~quf/Earley.jl
+tasks:
+  - run-tests-current: |
+      cd Earley.jl
+      julia --project=. -e 'import Pkg; Pkg.test()'
+  - download-lts: |
+      gpg --import - < Earley.jl/.builds/juliareleases.asc
+      curl --fail --location -o julia-1.6.7-linux-x86_64.tar.gz 'https://julialang-s3.julialang.org/bin/linux/x64/1.6/julia-1.6.7-linux-x86_64.tar.gz'
+      curl --fail --location 'https://julialang-s3.julialang.org/bin/linux/x64/1.6/julia-1.6.7-linux-x86_64.tar.gz.asc' | gpg --verify --trust-model always - julia-1.6.7-linux-x86_64.tar.gz
+      tar xzf julia-1.6.7-linux-x86_64.tar.gz
+  - run-tests-lts: |
+      cd Earley.jl
+      $HOME/julia-1.6.7/bin/julia --project=. -e 'import Pkg; Pkg.update(); Pkg.test()'
A => .builds/juliareleases.asc +52 -0
@@ 1,52 @@
+-----BEGIN PGP PUBLIC KEY BLOCK-----
+Version: GnuPG v1
+
+mQINBFXxFlcBEADQDEBFlzoyehPuk13Ct928WwBvb0q9OKyjz2NlYq3sL5ReTbQB
+9P5hyl68q5iJ6QTjKEaxr+Kmjhib9dQGZhtBXRa9q185Fdav48rS9rDKR5/aPXNi
+4aA0BSp7fHIDrTUGOUMB5TFpVZil+Sz4llpPKDlgG70dn3ZLBznJQKUXJWhxrheG
+ogUK4W3WAdBBPDVraPjBjvTTSrhoOBJh/oNib3J6xTIaUMhOFz+Vuq05BZI9UO6n
+OsE3dSW7X7dvqjcN3Ti7TgbJD5d4iOsQl8NhqItyS8ZULV8TPGOuwitoWxqgFIAL
+5bhM9Of4xOE0+rmgke1dKmMkq3cu6yCEFypqyxwShexe+1Mvx4Tn4/OqC7wFVpTA
+IH2ys7NsVcoLtZGqlBQnbXFmIu9ay51Zb4wwbJ5Qr9Rfx5xPvJoOVUpP/0I8+vlI
+CmBkP6vs9vMCCKcreP0FpjCTSRApv9IXuwjumOMb6P0GJPOuFVfsy4849ONPC/yM
+dMbeopi/BWfHu/Nqt7pqY210jncsdBPlPy7LvvhIkbpeZHQDoQVDPX88ZylhqKTy
+gpWPBT5ezJ5ib0nSvYIZjMOMlMWxDaNDBGZlyHizVFwLZk6qHWM7I2WbJGvNgBTv
+0dX9jBIDhdKdSZjc3wxh+nqZQg1l8xOOx9yCLSiBL1OHf4PYqJudL09AUwARAQAB
+tDNKdWxpYSAoQmluYXJ5IHNpZ25pbmcga2V5KSA8YnVpbGRib3RAanVsaWFsYW5n
+Lm9yZz6JAjgEEwECACIFAlXxFlcCGwMGCwkIBwMCBhUIAgkKCwQWAgMBAh4BAheA
+AAoJEGbjx9wD1uSVg78QAJZUeygDHj1zTxt+8UAm4TMu0nWmcPjSzTGj5Wt4Gtec
+HlWsXTOvFbABv8r3vzD2W1Bi0D0UcUucBy3Jf0nrUBWY89VTREcG/EWsF2SwSB7H
+cL3pu+vcdLiVtRGI4AiSoZz2CXc4vHY0X/3TlPejcO0UU8A0Ukth/cX1ZqCjKP8T
+ciXy89X4mlRAsAXapkHxiO+bscTd/VdWaPaUx8/TxeFoPZFB/0FIeJHYbI1chKPd
+vAtFYLpB89d8zbQYgISM6oc/f1j0CQR6JdHGoAGP9Wd8wRz+mDT3WzOqL4jXctcA
+CQUKGgYkOW8OEFBlfUACZK5uFxWMktN8//IlzczCTbYb9Z89UeeF7oaXfSZMFwiF
+kxseUGCceXb5Kqj3fZKmmUstAEzycyNuCeXG1KXyAz1mg/ihq/rzB11vQQjY4WYJ
+rIoUecRN3btSex6jcdOxAIOeGcyfigT7NMgplFXXkbuux2N7qtOkLUNx80DMOggK
+tnSP60GkO1xzJLi3EHtaDVPU59KpeXjyEsNB2ngc5+LwHwbYGvaaZaFXFm7oCmM7
+xG88EU14mCLZbpGleD6cmpVAprFSIXV0Z0xm6pdH9XBCT4UJ8tFXTrJsc1dYd+mw
+eAwCYZ38e95kqrYrRbhjOOAKEtf3t4VnrsifbTfTVclUbsrSXVTQdHoiMlODc/WX
+uQINBFXxFlcBEADNmFCh53NJ+8CQSzQda/efBX+H/SCj2b3vIYJXY2nR9h4IQ7UV
+/AU5sUB/bpIN3nwwdcILYSm2oJGP8fZ8Zf46XliUOK8+yD8ApDg6okl3R1G+E9Qk
+/EN49BCeXx9uT5vHpcHWkBvKmqmjUJ283i6q3QT5qzbkCGGUQ7SyhU1ywbjYIQi/
+HLJpntqz44LrM+vfGUAa+CJld3DyzAm66KFSRbDU12XPE948MxUDQ1NgY9hJIlfm
+ud/ShKakfQoEsLiTkUbEY7Vc19s2+aM3S1zeRfsatuayPuEUsnuz42wKWSdPNGyJ
+TkLdWz46vSgN9wpe0OLoWxsuomaViRaNFDSK7Uo+AGjWcjFNlehFlW/ELji1JbS5
+f5EAD1A1I2RJvLHyri3xFJtM9qbGiA3ZIfcVXq5RxAOehDPCcKzBS4w37D2vLBOQ
+Xa+ExTJxwiCnMPuo7acsfkyleakAe82L/fAoVWdPcFSjq3KFvkpGpTlvvh2jwhoW
+AgDGu77K9T1rHjj7t2GjuR71RVc4r0CP9iF3rAPmq/FapONW1Pz0aom7XLBZt8Zq
+4wsPsGaAECmwi07bE6Vr9nqCeQb7XmjVucVJP+VXDpOJzt4J5zSzTCWGyj47/K7a
+Rlz9KtYmY0s4sKnx3sjKpC8xMXaLgvSjudrQCZ/sohKRayKGAMI2p71GbQARAQAB
+iQIfBBgBAgAJBQJV8RZXAhsMAAoJEGbjx9wD1uSV6+oP/3MCyMWEBiu73HVI2dS2
+hDct/E9fDkpB6o/HEGhdNFTeeb/L7GqcQACJDtBDNVtMu0WhCgKeteHXM0KMy55f
+6HAQEVnWhGSyR4KksV93RPZvUO+zzX5M7F2LiI59MSruKAYTC0kXbjcu9aQAn+kJ
+EPHiHwsTzRkWh90q54/B2NQ6oVAHgnMIeh32OBdFMNHOnP+n1zu/+Wd4miC3fR9V
+tmsVrOS8WtozdEC6TmquYswQ/gT6c0afCZSlNF/ZPPrXGGdD6t9WTJntfYB1rbEk
+E/9WpaUgpKpxXQEOMzMAm+2yBoYnCpXzvbY6fzNWfOg6DJ65t0rkrCwDRHLH1grA
+61OQb0Ou8LQnrFGox8L394sFebIoaBUk2Vhw5LH78X6g1f7Mj6j9Er0YSabVVpHh
+ncMYflOeswrV4C1oP5UvL7K3qtCixUU4LQ4XqmioQey8AnrCdJ7S5QeyP1n5vU3e
+Nz1JHCcH4/e698CuIoCZa86Edmo3S0O2hhiC5qslf5u1pdndlmbrgsWpBH5kJ7mI
+edeA2ND/KrLlllE7NImLdlrciShctFP1ciqqHtTebQ+5MH17ObOhSptUDEt5LjZt
+3YXZtQ+C/UmfkC+QVUdWTQ4cWUCNtuzLP+PW3o1AQHmijWbaECq5yMRVlr7JuxPr
+Lr+fAJHZvbYCQjMTkZYScgYU
+=XN/B
+-----END PGP PUBLIC KEY BLOCK-----
A => .gitignore +1 -0
A => LICENSE.txt +287 -0
@@ 1,287 @@
+ EUROPEAN UNION PUBLIC LICENCE v. 1.2
+ EUPL © the European Union 2007, 2016
+
+This European Union Public Licence (the ‘EUPL’) applies to the Work (as defined
+below) which is provided under the terms of this Licence. Any use of the Work,
+other than as authorised under this Licence is prohibited (to the extent such
+use is covered by a right of the copyright holder of the Work).
+
+The Work is provided under the terms of this Licence when the Licensor (as
+defined below) has placed the following notice immediately following the
+copyright notice for the Work:
+
+ Licensed under the EUPL
+
+or has expressed by any other means his willingness to license under the EUPL.
+
+1. Definitions
+
+In this Licence, the following terms have the following meaning:
+
+- ‘The Licence’: this Licence.
+
+- ‘The Original Work’: the work or software distributed or communicated by the
+ Licensor under this Licence, available as Source Code and also as Executable
+ Code as the case may be.
+
+- ‘Derivative Works’: the works or software that could be created by the
+ Licensee, based upon the Original Work or modifications thereof. This Licence
+ does not define the extent of modification or dependence on the Original Work
+ required in order to classify a work as a Derivative Work; this extent is
+ determined by copyright law applicable in the country mentioned in Article 15.
+
+- ‘The Work’: the Original Work or its Derivative Works.
+
+- ‘The Source Code’: the human-readable form of the Work which is the most
+ convenient for people to study and modify.
+
+- ‘The Executable Code’: any code which has generally been compiled and which is
+ meant to be interpreted by a computer as a program.
+
+- ‘The Licensor’: the natural or legal person that distributes or communicates
+ the Work under the Licence.
+
+- ‘Contributor(s)’: any natural or legal person who modifies the Work under the
+ Licence, or otherwise contributes to the creation of a Derivative Work.
+
+- ‘The Licensee’ or ‘You’: any natural or legal person who makes any usage of
+ the Work under the terms of the Licence.
+
+- ‘Distribution’ or ‘Communication’: any act of selling, giving, lending,
+ renting, distributing, communicating, transmitting, or otherwise making
+ available, online or offline, copies of the Work or providing access to its
+ essential functionalities at the disposal of any other natural or legal
+ person.
+
+2. Scope of the rights granted by the Licence
+
+The Licensor hereby grants You a worldwide, royalty-free, non-exclusive,
+sublicensable licence to do the following, for the duration of copyright vested
+in the Original Work:
+
+- use the Work in any circumstance and for all usage,
+- reproduce the Work,
+- modify the Work, and make Derivative Works based upon the Work,
+- communicate to the public, including the right to make available or display
+ the Work or copies thereof to the public and perform publicly, as the case may
+ be, the Work,
+- distribute the Work or copies thereof,
+- lend and rent the Work or copies thereof,
+- sublicense rights in the Work or copies thereof.
+
+Those rights can be exercised on any media, supports and formats, whether now
+known or later invented, as far as the applicable law permits so.
+
+In the countries where moral rights apply, the Licensor waives his right to
+exercise his moral right to the extent allowed by law in order to make effective
+the licence of the economic rights here above listed.
+
+The Licensor grants to the Licensee royalty-free, non-exclusive usage rights to
+any patents held by the Licensor, to the extent necessary to make use of the
+rights granted on the Work under this Licence.
+
+3. Communication of the Source Code
+
+The Licensor may provide the Work either in its Source Code form, or as
+Executable Code. If the Work is provided as Executable Code, the Licensor
+provides in addition a machine-readable copy of the Source Code of the Work
+along with each copy of the Work that the Licensor distributes or indicates, in
+a notice following the copyright notice attached to the Work, a repository where
+the Source Code is easily and freely accessible for as long as the Licensor
+continues to distribute or communicate the Work.
+
+4. Limitations on copyright
+
+Nothing in this Licence is intended to deprive the Licensee of the benefits from
+any exception or limitation to the exclusive rights of the rights owners in the
+Work, of the exhaustion of those rights or of other applicable limitations
+thereto.
+
+5. Obligations of the Licensee
+
+The grant of the rights mentioned above is subject to some restrictions and
+obligations imposed on the Licensee. Those obligations are the following:
+
+Attribution right: The Licensee shall keep intact all copyright, patent or
+trademarks notices and all notices that refer to the Licence and to the
+disclaimer of warranties. The Licensee must include a copy of such notices and a
+copy of the Licence with every copy of the Work he/she distributes or
+communicates. The Licensee must cause any Derivative Work to carry prominent
+notices stating that the Work has been modified and the date of modification.
+
+Copyleft clause: If the Licensee distributes or communicates copies of the
+Original Works or Derivative Works, this Distribution or Communication will be
+done under the terms of this Licence or of a later version of this Licence
+unless the Original Work is expressly distributed only under this version of the
+Licence — for example by communicating ‘EUPL v. 1.2 only’. The Licensee
+(becoming Licensor) cannot offer or impose any additional terms or conditions on
+the Work or Derivative Work that alter or restrict the terms of the Licence.
+
+Compatibility clause: If the Licensee Distributes or Communicates Derivative
+Works or copies thereof based upon both the Work and another work licensed under
+a Compatible Licence, this Distribution or Communication can be done under the
+terms of this Compatible Licence. For the sake of this clause, ‘Compatible
+Licence’ refers to the licences listed in the appendix attached to this Licence.
+Should the Licensee's obligations under the Compatible Licence conflict with
+his/her obligations under this Licence, the obligations of the Compatible
+Licence shall prevail.
+
+Provision of Source Code: When distributing or communicating copies of the Work,
+the Licensee will provide a machine-readable copy of the Source Code or indicate
+a repository where this Source will be easily and freely available for as long
+as the Licensee continues to distribute or communicate the Work.
+
+Legal Protection: This Licence does not grant permission to use the trade names,
+trademarks, service marks, or names of the Licensor, except as required for
+reasonable and customary use in describing the origin of the Work and
+reproducing the content of the copyright notice.
+
+6. Chain of Authorship
+
+The original Licensor warrants that the copyright in the Original Work granted
+hereunder is owned by him/her or licensed to him/her and that he/she has the
+power and authority to grant the Licence.
+
+Each Contributor warrants that the copyright in the modifications he/she brings
+to the Work are owned by him/her or licensed to him/her and that he/she has the
+power and authority to grant the Licence.
+
+Each time You accept the Licence, the original Licensor and subsequent
+Contributors grant You a licence to their contributions to the Work, under the
+terms of this Licence.
+
+7. Disclaimer of Warranty
+
+The Work is a work in progress, which is continuously improved by numerous
+Contributors. It is not a finished work and may therefore contain defects or
+‘bugs’ inherent to this type of development.
+
+For the above reason, the Work is provided under the Licence on an ‘as is’ basis
+and without warranties of any kind concerning the Work, including without
+limitation merchantability, fitness for a particular purpose, absence of defects
+or errors, accuracy, non-infringement of intellectual property rights other than
+copyright as stated in Article 6 of this Licence.
+
+This disclaimer of warranty is an essential part of the Licence and a condition
+for the grant of any rights to the Work.
+
+8. Disclaimer of Liability
+
+Except in the cases of wilful misconduct or damages directly caused to natural
+persons, the Licensor will in no event be liable for any direct or indirect,
+material or moral, damages of any kind, arising out of the Licence or of the use
+of the Work, including without limitation, damages for loss of goodwill, work
+stoppage, computer failure or malfunction, loss of data or any commercial
+damage, even if the Licensor has been advised of the possibility of such damage.
+However, the Licensor will be liable under statutory product liability laws as
+far such laws apply to the Work.
+
+9. Additional agreements
+
+While distributing the Work, You may choose to conclude an additional agreement,
+defining obligations or services consistent with this Licence. However, if
+accepting obligations, You may act only on your own behalf and on your sole
+responsibility, not on behalf of the original Licensor or any other Contributor,
+and only if You agree to indemnify, defend, and hold each Contributor harmless
+for any liability incurred by, or claims asserted against such Contributor by
+the fact You have accepted any warranty or additional liability.
+
+10. Acceptance of the Licence
+
+The provisions of this Licence can be accepted by clicking on an icon ‘I agree’
+placed under the bottom of a window displaying the text of this Licence or by
+affirming consent in any other similar way, in accordance with the rules of
+applicable law. Clicking on that icon indicates your clear and irrevocable
+acceptance of this Licence and all of its terms and conditions.
+
+Similarly, you irrevocably accept this Licence and all of its terms and
+conditions by exercising any rights granted to You by Article 2 of this Licence,
+such as the use of the Work, the creation by You of a Derivative Work or the
+Distribution or Communication by You of the Work or copies thereof.
+
+11. Information to the public
+
+In case of any Distribution or Communication of the Work by means of electronic
+communication by You (for example, by offering to download the Work from a
+remote location) the distribution channel or media (for example, a website) must
+at least provide to the public the information requested by the applicable law
+regarding the Licensor, the Licence and the way it may be accessible, concluded,
+stored and reproduced by the Licensee.
+
+12. Termination of the Licence
+
+The Licence and the rights granted hereunder will terminate automatically upon
+any breach by the Licensee of the terms of the Licence.
+
+Such a termination will not terminate the licences of any person who has
+received the Work from the Licensee under the Licence, provided such persons
+remain in full compliance with the Licence.
+
+13. Miscellaneous
+
+Without prejudice of Article 9 above, the Licence represents the complete
+agreement between the Parties as to the Work.
+
+If any provision of the Licence is invalid or unenforceable under applicable
+law, this will not affect the validity or enforceability of the Licence as a
+whole. Such provision will be construed or reformed so as necessary to make it
+valid and enforceable.
+
+The European Commission may publish other linguistic versions or new versions of
+this Licence or updated versions of the Appendix, so far this is required and
+reasonable, without reducing the scope of the rights granted by the Licence. New
+versions of the Licence will be published with a unique version number.
+
+All linguistic versions of this Licence, approved by the European Commission,
+have identical value. Parties can take advantage of the linguistic version of
+their choice.
+
+14. Jurisdiction
+
+Without prejudice to specific agreement between parties,
+
+- any litigation resulting from the interpretation of this License, arising
+ between the European Union institutions, bodies, offices or agencies, as a
+ Licensor, and any Licensee, will be subject to the jurisdiction of the Court
+ of Justice of the European Union, as laid down in article 272 of the Treaty on
+ the Functioning of the European Union,
+
+- any litigation arising between other parties and resulting from the
+ interpretation of this License, will be subject to the exclusive jurisdiction
+ of the competent court where the Licensor resides or conducts its primary
+ business.
+
+15. Applicable Law
+
+Without prejudice to specific agreement between parties,
+
+- this Licence shall be governed by the law of the European Union Member State
+ where the Licensor has his seat, resides or has his registered office,
+
+- this licence shall be governed by Belgian law if the Licensor has no seat,
+ residence or registered office inside a European Union Member State.
+
+Appendix
+
+‘Compatible Licences’ according to Article 5 EUPL are:
+
+- GNU General Public License (GPL) v. 2, v. 3
+- GNU Affero General Public License (AGPL) v. 3
+- Open Software License (OSL) v. 2.1, v. 3.0
+- Eclipse Public License (EPL) v. 1.0
+- CeCILL v. 2.0, v. 2.1
+- Mozilla Public Licence (MPL) v. 2
+- GNU Lesser General Public Licence (LGPL) v. 2.1, v. 3
+- Creative Commons Attribution-ShareAlike v. 3.0 Unported (CC BY-SA 3.0) for
+ works other than software
+- European Union Public Licence (EUPL) v. 1.1, v. 1.2
+- Québec Free and Open-Source Licence — Reciprocity (LiLiQ-R) or Strong
+ Reciprocity (LiLiQ-R+).
+
+The European Commission may update this Appendix to later versions of the above
+licences without producing a new version of the EUPL, as long as they provide
+the rights granted in Article 2 of this Licence and protect the covered Source
+Code from exclusive appropriation.
+
+All other changes or additions to this Appendix require the production of a new
+EUPL version.
A => Project.toml +12 -0
@@ 1,12 @@
+name = "Earley"
+uuid = "98d8810e-6867-4d91-bca0-8798dfcfe9b1"
+authors = ["Lukas Himbert <lukas@2.71828.eu>"]
+version = "1.0.0"
+
+[deps]
+DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+
+[compat]
+julia = "1.0"
+DataStructures = "0.18"
A => README.md +152 -0
@@ 1,152 @@
+Earley.jl
+=========
+
+Parse context-free languages with Earley's algorithm.
+
+Examples
+--------
+
+```julia
+julia> grammar = Grammar([ # Decimal numbers with an optional sign
+ (:Number => [:Unsigned], identity),
+ (:Number => [Match.OneOf("+-"), :Unsigned], (c, n) -> (c == '+') ? n : -n),
+ (:Unsigned => [Match.Digit()], d -> d - '0'),
+ (:Unsigned => [:Unsigned, Match.Digit()], (n, d) -> 10n + (d-'0')),
+ ]);
+
+julia> parse(grammar, "-12")
+-12
+```
+
+```julia
+julia> grammar = Grammar([ # s-expressions
+ (:sexpr => [:par_open, :values, :par_close], (_, values, _) -> tuple(values...)),
+ (:value => [:identifier], identity),
+ (:value => [:sexpr], identity),
+ (:values => [], () -> []),
+ (:values => [:ws, :values, :ws, :value, :ws], (_, vs, _, v, _) -> push!(vs, v)),
+ (:identifier => [Match.Letter()], c -> string(c)),
+ (:identifier => [:identifier, Match.Letter()], (i, c) -> i * c),
+ (:par_open => [:ws, '(', :ws], (_, _, _) -> nothing),
+ (:par_close => [:ws, ')', :ws], (_, _, _) -> nothing),
+ (:ws => [], () -> nothing),
+ (:ws => [Match.Space(), :ws], (_, _) -> nothing),
+ ]);
+
+julia> parse(grammar, "(abc (def ghi) (j))")
+("abc", ("def", "ghi"), ("j",))
+```
+
+```julia
+julia> g = Grammar([ # A simple arithmetic grammar with mixed associativity.
+ (:expression => [:sum], identity),
+ (:expression => [:product], identity),
+ (:sum => [:sum, '+', :product], (e1,_,e2) -> Expr(:call, :+, e1, e2)),
+ (:sum => [:sum, '-', :product], (e1,_,e2) -> Expr(:call, :-, e1, e2)),
+ (:sum => [:product], identity),
+ (:product => [:factor], identity),
+ (:product => [:product, '*', :factor], (e1,_,e2) -> Expr(:call, :*, e1, e2)),
+ (:factor => [:number], identity),
+ (:factor => [:power], identity),
+ (:factor => ['(', :expression, ')'], (_,e,_) -> e),
+ (:number => [Match.Digit()], c -> c-'0'),
+ (:power => [:factor, '^', :factor], (e1,_,e2) -> Expr(:call, :^, e1, e2)),
+ ]);
+
+julia> parse(g, "1+2-3+4")
+:(((1 + 2) - 3) + 4)
+
+julia> parse(g, "2*3^4^(5+6)*7")
+:((2 * 3 ^ (4 ^ (5 + 6))) * 7)
+
+julia> parse(g, "1-2*3^4+5")
+:((1 - 2 * 3 ^ 4) + 5)
+```
+
+```julia
+julia> grammar = CFG([ # An even number of 'a' characters
+ :A => [:A, :A],
+ :A => ['a', 'a'],
+ :A => [],
+ ]);
+
+julia> recognize(grammar, "aaa")
+false
+
+julia> recognize(grammar, "aaaa")
+true
+```
+
+
+Overview
+--------
+
+This package provides the following:
+
+* `CFG`, a datatype for modeling context-free grammars.
+
+* `Grammar`, a datatype for modeling context-free grammars and semantic actions associated with each production rule; i.e. a grammar with synthesized attributes.
+
+* `recognize(grammar, input)`, a function that can tell for any grammar and any input, whether the input belongs to the language defined by the grammar.
+
+* `parse(grammar, input)`, a function that can parse a given input and return either a parse tree, or a value computed through semantic actions.
+
+* `matches`, a function that matches input tokens against terminals listed in the production rules.
+
+* `Match`, a submodule with various predefined token classes.
+
+For detailed information, see the respective Julia docstrings.
+
+
+Compatibility
+-------------
+
+`Earley.jl` follows [semantic versioning v2.0.0](https://semver.org/).
+The current version is 1.0.0.
+
+This package works with Julia version 1.6.7 (the current LTS) and above.
+It should also work for Julia version 1.0.
+
+It depends on the [DataStructures](https://github.com/JuliaCollections/DataStructures.jl) package.
+
+
+Releases
+--------
+
+### Version 1.0.0
+
+Initial public version
+
+
+Bugs, Caveats, TODO
+-------------------
+
+- Performance has only been a minor consideration during the development of this package.
+ Some of the included algorithms have asymptotically faster alternatives which are not implemented here.
+
+- There is no support for repeated or optional terms, such as `A ::= 'a' *` from EBNF.
+ It's up to the user to translate constructs such as this into the form required for recognition/parsing.
+
+- The parser does not support cyclic grammars.
+ (The recognizer does.)
+ It seems feasible to add support for cyclic grammars in principle, but it would require a lot of effort and the payoff would be questionable.
+
+- In the case of ambiguous languages, the parser can only return one parse tree, not the whole parse forest.
+
+- There is no support for reporting partial parses or likely fixes in the case of minor syntax errors.
+
+- Error messages for incorrect grammars may be hard to decipher.
+
+
+See also
+--------
+
+A [tutorial by Loup Vaillant](https://loup-vaillant.fr/tutorials/earley-parsing/) has been helpful to the author in understanding the principles of Earley parsing.
+
+
+Copyright
+---------
+
+Ⓒ Lukas Himbert 2022
+
+Licensed under the [EUPL-1.2-or-later](https://joinup.ec.europa.eu/collection/eupl/eupl-text-eupl-12).
A => src/Earley.jl +752 -0
@@ 1,752 @@
+"""`Earley`
+
+`Earley` contains a recognizer and parser for context-free grammars.
+
+Context-free grammars are modeled using the `CFG` datatype (preferred for
+the recognizer) or the `Grammar` datatype (preferred for the parser). The
+`Grammar` type includes a semantic action for each production rule of the
+grammar.
+
+The module provides the following functionality:
+
+* `recognize(grammar, input)` checks if the `input` matches the language
+ described by the `grammar`.
+
+* `parse(grammar :: Grammar, input)` parses the `input` and computes the
+ semantic actions on the parse tree.
+
+* `parse(grammar :: CFG, input)` parses the `input` and returns a `ParseTree`.
+
+* `matches(class, token)` is used by the algorithm to match a token (in the
+ input) against a terminal/token class (in the right-hand side of a
+ production rule). Some terminals are predefined in the `Match` submodule.
+
+`recognize` supports all context-free grammars.
+
+`parse` supports most practical grammars, including those with left-recursion,
+right-recursion, mixed recursion, and ambiguity. However, cyclic grammars are
+not supported as these may generate an infinite parse tree for a finite input.
+"""
+module Earley
+
+using DataStructures: Stack
+using DataStructures: Queue, enqueue!, dequeue!
+using DataStructures: OrderedSet
+
+include("Match.jl")
+using .Match
+
+import Base.parse
+
+export CFG
+export Grammar
+export matches
+export Match
+export recognize
+
+
+
+### CFG datatype
+
+"""`CFG{T}(rules[, start_symbol])`
+
+A context-free grammar with terminals of type `T` (usually `Any`). If no
+`start_symbol` is given, the lhs of the first rule in `rules` is assumed.
+
+`rules` is an array of production rules in the form of pairs `A => rhs`,
+where `A` is a nonterminal (a `Symbol`) and `rhs` is an array of terminals (of
+any type) and nonterminals.
+
+Examples
+========
+
+```
+CFG{Char}([
+ :A => ['a'],
+ :A => [:A, :A]
+]);
+```
+is an ambiguous grammar that matches any (nonzero) number of 'a' characters.
+
+```
+CFG{String}([
+ :Block => ["{}"],
+ :Block => ["if()", :Block],
+ :Block => ["if()", :Block, "else", :Block],
+]);
+```
+is a grammar that matches a trivial C-like language with "dangling else"
+ambiguity. Unlike the previous example, input tokens are strings rather than
+Chars.
+
+```
+CFG([
+ :Number => [Match.Digit()],
+ :Number => [Match.Digit(), :Number]
+]);
+```
+matches an unsigned decimal number. The parameter `T` is assumed to be
+`Any`.
+"""
+struct CFG{T}
+    rules :: Vector{Pair{Symbol,Vector{Union{Symbol,T}}}}  # production rules `lhs => rhs`
+    start_symbol :: Symbol                                 # nonterminal the whole input is parsed as
+    # Inner constructor: copy every right-hand side into a `Vector{Union{Symbol,T}}`.
+    CFG{T}(rules, start_symbol) where {T} =
+        new{T}([head => collect(Union{Symbol,T}, body) for (head, body) in rules], start_symbol)
+end
+
+CFG{T}(rules) where {T} = CFG{T}(rules, first(first(rules)))  # default start: lhs of the first rule
+
+# Without an explicit type parameter, terminals may be of any type.
+CFG(rules, start_symbol) = CFG{Any}(rules, start_symbol)
+CFG(rules) = CFG{Any}(rules)
+
+
+
+### Grammar datatype
+
+"""`Grammar{T}(rules_and_actions[, start_symbol])`
+
+A context-free grammar with terminals of type `T` (usually `Any`), as well as
+a list of semantic actions (i.e. functions). `start_symbol` is an optional
+argument specifying the start symbol (that needs to match the whole input). If
+no value is supplied, the left-hand side of the first production rule is
+assumed.
+
+The list of rules and actions should be an iterable of tuples
+`(lhs => rhs, action)`, where `lhs => rhs` is a production rule of a context
+free grammar as described in the documentation for `CFG`. In short, `lhs`
+should be a `Symbol` (nonterminal) and `rhs` should be an array of `Symbol`s
+and terminals (which may be compared to the input using `matches`). The
+`action` should be a function taking one argument for each term on the
+`rhs` of the rule and returning a single value.
+
+In formal terms, `Grammar` describes an attribute grammar with synthesized
+attributes, and the semantic actions are closures which define the attributes.
+
+Example
+=======
+
+```
+julia> g_num = Grammar([
+ (:Number => [:Unsigned], n -> n),
+ (:Number => ['+', :Unsigned], (_, n) -> n),
+ (:Number => ['-', :Unsigned], (_, n) -> -n),
+ (:Unsigned => [Match.Digit()], d -> (d - '0')),
+ (:Unsigned => [:Unsigned, Match.Digit()], (n, d) -> 10n + (d - '0'))
+]);
+```
+matches a decimal integer (with optional sign) and computes its value.
+"""
+struct Grammar{T}
+    cfg :: CFG{T}                # the production rules, without their actions
+    actions :: Vector{Function}  # the actions, in the order the entries of `g` were given
+    function Grammar{T}(g, start_symbol::Symbol) where {T}
+        productions = (production for (production, _) in g)  # lazy; consumed by CFG{T} below
+        return new{T}(CFG{T}(productions, start_symbol), Function[handler for (_, handler) in g])
+    end
+end
+
+function Grammar{T}(g) where {T}
+    leading_rule, _ = first(g)       # each entry of `g` is (lhs => rhs, action)
+    default_start, _ = leading_rule  # the lhs of the first rule becomes the start symbol
+    return Grammar{T}(g, default_start)
+end
+Grammar(g, s::Symbol) = Grammar{Any}(g, s)  # terminals default to type `Any`
+Grammar(g) = Grammar{Any}(g)
+
+
+
+### Earley Item datatype
+
+"""`Item(start, rule, dot)`
+
+Earley parser item, i.e. a compact representation of a partial parse.
+
+* `start` is the position in the token stream where the partial parse begins,
+ i.e. `start` is one if the parse starts at the first input token, two if it
+ starts at the second input token, etc.
+* `rule` is the index of the rule being parsed in the list of rules, i.e. one
+ for the first rule, two for the second rule, etc.
+* `dot` is the position of the 'dot' (in the common abstract notation of the
+ algorithm), i.e. which sub-parse we're expecting next. The value is one if
+ we need to try the first sub-parse, two if we need to try the second
+ sub-parse, and equal to one plus the number of terms on the right-hand side
+ if the sub-parse has been completed successfully.
+"""
+struct Item
+    start::Int  # 1-based token position at which this partial parse begins
+    rule::Int   # index of the production rule within the grammar's rule list
+    dot::Int    # 1-based index of the next rhs term to match; length(rhs)+1 once complete
+end
+
+
+### recognizer
+
+"""`recognize(grammar::CFG, input)`
+
+Returns true if the input matches the given `grammar`, false otherwise. The
+`input` shall be an array of tokens (e.g. Char, UInt8, String, …).
+
+Terminal input tokens are matched against token classes using `matches`. The
+default implementation compares equality (i.e. if a token is present in the
+right hand side of a production rule, that token must be matched in the input
+for the rule to match), but `matches` can be overloaded to create custom token
+classes for convenience. For predefined classes, see `Earley.Match`.
+
+There is no restriction on the `grammar`, except that it must be context-free.
+
+Examples
+--------
+
+```
+julia> recognize(
+ CFG([
+ :Par => ['(', :Par, ')'],
+ :Par => []
+ ]),
+ "(()))"
+ )
+false
+```
+
+```
+julia> recognize(
+ CFG([
+ :Num => ['0', 'x', :Digits],
+ :Digits => [Match.HexDigit()],
+ :Digits => [Match.HexDigit(), :Digits],
+ ]),
+ "0xc0ffee"
+ )
+true
+```
+"""
+function recognize(grammar :: CFG, input) :: Bool
+    # Accept iff the last chart set holds a completed start-symbol rule that began at the first token.
+    return any(last(chart(grammar, input))) do candidate
+        (head, body) = grammar.rules[candidate.rule]
+        candidate.dot > length(body) && candidate.start == 1 && head == grammar.start_symbol
+    end
+end
+
+
+"""`recognize(grammar :: Grammar, input)`
+
+Returns true if the input matches the given grammar, false otherwise. Semantic
+actions associated with the production rules are ignored.
+"""
+recognize(grammar :: Grammar, input) :: Bool = (
+    recognize(grammar.cfg, input)  # only the wrapped CFG is consulted; no action is run
+)
+
+
+"""`chart(grammar :: CFG, input)`
+
+Core implementation of the Earley algorithm.
+
+Computes an Earley chart (list of attempted partial parses). This works for
+any context-free `grammar` (as described in `CFG`) and `input`. `input` may be
+an arbitrary iterator.
+"""
+function chart(grammar :: CFG, input)
+    # `items[k]` is the set of Earley items (struct `Item`) describing every partial parse that
+    # is still possible once the first `k-1` tokens have been consumed: `items[1]` is the state
+    # before any input has been read, `items[2]` the state after the first token, and so on.
+    # One set is appended per token rather than preallocating `length(input)+1` of them, so
+    # `input` only has to be iterable and does not need to support `length`.
+    items = [OrderedSet{Item}()]
+    null = nullables(grammar)
+
+    # We start parsing from the start symbol
+    for (i, (lhs, _)) in enumerate(grammar.rules)
+        if lhs == grammar.start_symbol
+            push!(items[1], Item(1, i, 1))
+        end
+    end
+
+    # Continue all possible partial parses, one token at a time.
+    # NOTE: `complete!` and `predict!` push into the very set that is being iterated; the
+    # algorithm relies on the iteration also visiting the items added along the way.
+    for (i, token) in enumerate(input)
+        push!(items, OrderedSet{Item}())  # set `i+1`, which `scan!` fills for the next round
+        for item in items[i]
+            (lhs, rhs) = grammar.rules[item.rule]
+            if item.dot > length(rhs)
+                complete!(items, i, grammar, item)
+            elseif rhs[item.dot] isa Symbol
+                predict!(items, i, grammar, item, null)
+            elseif matches(rhs[item.dot], token)
+                scan!(items, i, grammar, item)
+            else
+                # partial parse failed, can't continue
+            end
+        end
+    end
+
+    # final predict/complete after input has been consumed
+    for item in last(items)
+        (lhs, rhs) = grammar.rules[item.rule]
+        if item.dot > length(rhs)
+            complete!(items, length(items), grammar, item)
+        elseif rhs[item.dot] isa Symbol
+            predict!(items, length(items), grammar, item, null)
+        else
+            # we're stuck: a terminal is expected but there is no input left
+        end
+    end
+
+    return items
+end
+
+function complete!(items, current, grammar, item)
+    (completed_term, completed_term_rhs) = grammar.rules[item.rule]
+    @assert item.dot > length(completed_term_rhs)
+
+    # advance every item of the origin set whose dot sits in front of the completed nonterminal
+    for parent in items[item.start]
+        (_, parent_rhs) = grammar.rules[parent.rule]
+        waiting = parent.dot <= length(parent_rhs) && parent_rhs[parent.dot] == completed_term
+        waiting && push!(items[current], Item(parent.start, parent.rule, parent.dot+1))
+    end
+end
+
+function predict!(items, current, grammar, item, nullableset)
+    (_, rhs) = grammar.rules[item.rule]
+    @assert rhs[item.dot] isa Symbol
+    expected = rhs[item.dot]
+
+    # A nullable nonterminal can produce the empty string, so the dot may step over it right away.
+    if expected in nullableset
+        push!(items[current], Item(item.start, item.rule, item.dot+1))
+    end
+
+    # start a fresh item in the current set for every rule that defines the expected nonterminal
+    for (idx, rule) in enumerate(grammar.rules)
+        rule[1] == expected && push!(items[current], Item(current, idx, 1))
+    end
+end
+
+# Scanning: the item with its dot advanced by one is added to the set of the next position.
+scan!(items, current, grammar, item) =
+    push!(items[current+1], Item(item.start, item.rule, item.dot+1))
+
+
+"""`nullables(grammar)`
+
+Compute the nullable nonterminals of `grammar`, i.e. all nonterminals that can
+produce the empty string. The result is a `Set{Symbol}`.
+"""
+function nullables(grammar :: CFG{T}) where {T}
+    # TODO: currently this function has worst case time complexity O(length(grammar.rules)^3). A better technique is described in https://github.com/jeffreykegler/kollos/blob/master/notes/misc/loup2.md
+    nullable = Set{Symbol}()
+    # fixed-point iteration: sweep over all rules until a sweep adds nothing new
+    changed = true
+    while changed
+        changed = false
+        for (lhs, rhs) in grammar.rules
+            lhs in nullable && continue
+            # a rule yields the empty string if every term on its right-hand side is nullable
+            if all(t -> t in nullable, rhs)
+                push!(nullable, lhs)
+                changed = true
+            end
+        end
+    end
+    return nullable
+end
+
+nullables(grammar :: Grammar) = nullables(grammar.cfg)
+
+
+
+### Parser datatypes
+
+"""`CompletedItem(start, stop, rule)`
+
+Compact representation of a completed partial parse.
+
+* `start` is the position in the token stream where the partial parse begins,
+ i.e. `start` is one if the parse starts at the first token, etc.
+* `stop` is the position in the token stream where the partial parse ends,
+ i.e. `stop` is one if the parse stops before the first token, two if it
+ stops after the first token, three if it stops after the second token, etc.
+* `rule` is the index of the rule being parsed in the list of rules, i.e. one
+ for the first rule, two for the second rule, etc.
+ The value `0` is used as a sentinel to mark a matched terminal, which does
+ not correspond to any rule.
+"""
+struct CompletedItem
+ start :: Int
+ stop :: Int
+ rule :: Int # May be `0` iff it corresponds to a terminal on the RHS of a production rule.
+end
+
+
+"""`ParseTree`
+
+Data structure representing a parse tree, or a node therein. The following
+members may be accessed:
+
+- `rule`: Index of the rule corresponding to this node in the tree. Values 1
+ through `length(rules)` are used for actual rules, `0` is used for
+ matched terminals.
+- `start`: Start index of the part of the input represented by this node.
+- `stop`: One plus the last index of the part of the input represented by
+ this node. If `stop == start`, the node matches an empty part. If
+ `rule == 0` (i.e. the node corresponds to a terminal), `stop == start+1`,
+ and `start` is the index of the terminal in the input.
+- `children :: Vector{ParseTree}`: A ParseTree (node) for each term on the
+ right-hand side of the production `rule`, in order. For terminals, this
+ will be empty.
+"""
+struct ParseTree
+ rule :: Int # `0` marks a matched terminal, any other value is an index into the grammar's rules.
+ start :: Int
+ stop :: Int
+ children :: Vector{ParseTree}
+end
+
+
+
+### Parser
+
+"""`process_chart_for_parser(grammar::CFG, chart)`
+
+Processes the chart (list of partial parses) for later use by the parser. The
+core Earley algorithm returns a list of partial parses, some of which are
+complete, and some of which are not. The parser only needs the completed ones
+and prefers them indexed differently.
+
+The input is a list of item sets whose position in the list implicitly encodes
+where each partial parse _stops_. The result is a list of vectors of
+`CompletedItem`s, one per chart position, where the position in the list
+implicitly encodes where each completed parse _starts_. Every inner vector is
+sorted by rule index, from smallest to largest.
+"""
+function process_chart_for_parser(grammar :: CFG, chart)
+    by_start = [CompletedItem[] for _ in 1:length(chart)]
+    for (stop, itemset) in enumerate(chart)
+        for it in itemset
+            (_, rhs) = grammar.rules[it.rule]
+            # only items whose dot has moved past the end of their rule are complete
+            it.dot > length(rhs) || continue
+            push!(by_start[it.start], CompletedItem(it.start, stop, it.rule))
+        end
+    end
+    foreach(by_start) do bucket
+        sort!(bucket; by = c -> c.rule)
+    end
+    return by_start
+end
+
+"""`parse(grammar::Grammar, input)`
+
+Parse the given `input` (any iterable of tokens, e.g. a `String` or a `Vector`)
+according to the production rules and semantic actions in `grammar`, returning
+the result of the semantic action of the rule that matches the whole input.
+
+If the `grammar` is ambiguous, the parser will always use the first matching
+rule to resolve the ambiguity.
+
+Cyclic grammars are not supported as these can produce an infinite parse tree
+for a finite input. Ambiguity, left-recursion and/or right-recursion do not
+generally pose a problem for the parser. All unambiguous context-free grammars
+are supported. Raises an `ErrorException` ("no parse") if nothing matches.
+
+Examples
+========
+
+```
+julia> g_num = Grammar([
+ (:Number => [:Unsigned], n -> n),
+ (:Number => ['+', :Unsigned], (_, n) -> n),
+ (:Number => ['-', :Unsigned], (_, n) -> -n),
+ (:Unsigned => [Match.Digit()], d -> (d - '0')),
+ (:Unsigned => [:Unsigned, Match.Digit()], (n, d) -> 10n + (d - '0'))
+]);
+```
+matches a decimal integer (with optional sign) and computes its value:
+```
+julia> parse(g_num, "42")
+42
+
+julia> parse(g_num, "-3141")
+-3141
+```
+
+```
+julia> g_num = Grammar{String}([
+ (:Block => ["{}"], _ -> ()),
+ (:Block => ["if()", :Block], (args...) -> args),
+ (:Block => ["if()", :Block, "else", :Block], (args...) -> args),
+]);
+```
+is a trivial C-like grammar with "dangling else" ambiguity. The input is
+a list of `String`s rather than a string (of `Char`s):
+```
+julia> parse(g_num, ["if()", "if()", "{}", "else", "{}"])
+("if()", ("if()", (), "else", ()))
+```
+
+```
+julia> g = Grammar([
+ (:A => [:A], identity),
+ (:A => ['a'], identity),
+ ]);
+```
+is a cyclic grammar and is unsupported.
+"""
+function Base.parse(grammar :: Grammar, input)
+    checkgrammar(grammar)
+    # Chart positions count tokens, whereas e.g. a `String` is indexed by byte: collect so that `tokens[i]` is the i-th token.
+    tokens = input isa AbstractVector ? input : collect(input)
+    completed_chart = process_chart_for_parser(grammar.cfg, chart(grammar.cfg, tokens))
+    for item in first(completed_chart)
+        spans_input = item.stop > length(tokens) && grammar.cfg.rules[item.rule][1] == grammar.cfg.start_symbol
+        spans_input && return do_parse(grammar, tokens, completed_chart, item)
+    end
+    error("no parse")
+end
+
+
+"""`parse(grammar :: CFG, input)`
+
+Parse a given `input` (any iterable of tokens), returning a `ParseTree`.
+
+The method `parse(::Grammar, input)` should be preferred to this one in most
+cases. The same restrictions on the `grammar` apply to both methods.
+"""
+function Base.parse(grammar :: CFG, input)
+    checkgrammar(grammar)
+    # Chart positions count tokens, whereas e.g. a `String` is indexed by byte: collect so that `tokens[i]` is the i-th token.
+    tokens = input isa AbstractVector ? input : collect(input)
+    completed_chart = process_chart_for_parser(grammar, chart(grammar, tokens))
+    for item in first(completed_chart)
+        spans_input = item.stop > length(tokens) && grammar.rules[item.rule][1] == grammar.start_symbol
+        spans_input && return do_parse(grammar, tokens, completed_chart, item)
+    end
+    error("no parse")
+end
+
+
+function do_parse(grammar :: CFG, input, chart, item)
+    # rule 0 marks a matched terminal, which becomes a node without children
+    item.rule == 0 && return ParseTree(item.rule, item.start, item.stop, ParseTree[])
+    parts = decompose(grammar, input, chart, item)
+    subtrees = [do_parse(grammar, input, chart, part) for part in parts]
+    return ParseTree(item.rule, item.start, item.stop, subtrees)
+end
+
+function do_parse(grammar :: Grammar, input, chart, item)
+    # rule 0 marks a matched terminal, which evaluates to the token itself
+    item.rule == 0 && return input[item.start]
+    action = grammar.actions[item.rule]
+    args = (do_parse(grammar, input, chart, part) for part in decompose(grammar.cfg, input, chart, item))
+    return action(args...)
+end
+
+
+"""`finditems(grammar, chart, term, start, stop)`
+
+Return a vector of all completed parses of the nonterminal `term` that start at
+`start` and end no later than `stop`. `chart` must be a processed set of
+completed Earley items as returned by `process_chart_for_parser()`.
+
+The items are sorted by rule index, then by length (shortest first).
+"""
+function finditems(grammar, chart, term, start, stop)
+    result = CompletedItem[]
+    start > stop && return result
+    for item in chart[start]
+        (lhs, _) = grammar.rules[item.rule]
+        if item.stop ≤ stop && lhs == term
+            push!(result, item)
+        end
+    end
+    # Sort by the lexicographic key (rule, stop); all candidates share `start`, so `stop` orders by length.
+    # (`a.rule < b.rule || a.stop < b.stop` can claim both a < b and b < a and is not a valid `lt`.)
+    sort!(result; by = it -> (it.rule, it.stop), alg = Base.Sort.DEFAULT_STABLE)
+    return result
+end
+
+
+"""`decompose(grammar::CFG, input, chart, item)`
+
+Given a completed Earley `item`, return a vector of completed items, one for
+each term on the right-hand side of `item.rule` in the `grammar`, in order.
+Matched terminals are represented by items with rule index `0`.
+"""
+function decompose(grammar :: CFG, input, chart, item) :: Vector{CompletedItem}
+ (lhs, rhs) = grammar.rules[item.rule]
+
+ if isempty(rhs)
+ return CompletedItem[]
+ end
+
+ # The workstack holds, for every term of the right-hand side reached so far, a queue of candidate items that have not been tried yet.
+ # The bottom-most queue belongs to the first term on the rhs, the one above it to the second term on the rhs, etc.
+ workstack = Stack{Queue{CompletedItem}}()
+
+ # `items` holds the candidate currently chosen for every term on the rhs so far. Parses are matched greedily, one term at a time, until a match is definitely known to fail.
+ # In that case, we back up one term and try the next candidate.
+ items = Stack{CompletedItem}()
+
+ # Populate the initial workstack with the candidates for the first term
+ if first(rhs) isa Symbol
+ push!(workstack, Queue{CompletedItem}())
+ for item in finditems(grammar, chart, first(rhs), item.start, item.stop) # the loop variable shadows the outer `item` inside this loop only
+ enqueue!(first(workstack), item)
+ end
+ elseif matches(first(rhs), input[item.start])
+ push!(workstack, Queue{CompletedItem}())
+ enqueue!(first(workstack), CompletedItem(item.start, item.start+1, 0))
+ else
+ error("This should never happen")
+ end
+
+ # Find appropriate items, one at a time. `first(...)` on a stack is used as its top element.
+ while !isempty(workstack)
+ @assert length(workstack) == length(items) + 1
+ if length(items) == length(rhs) && first(items).stop == item.stop
+ return collect(Iterators.reverse(items))
+ end
+ if !isempty(first(workstack)) && length(items) < length(rhs)
+ # try to match the next candidate for the current term
+ it = dequeue!(first(workstack))
+ if it.stop ≤ item.stop
+ push!(items, it)
+ next_candidates = Queue{CompletedItem}()
+ push!(workstack, next_candidates)
+ if length(items) < length(rhs)
+ next_term = rhs[length(items)+1]
+ if next_term isa Symbol
+ # Matching nonterminal: every completed parse of it that starts where `it` stopped
+ for i in finditems(grammar, chart, next_term, it.stop, item.stop)
+ enqueue!(next_candidates, i)
+ end
+ elseif it.stop ≤ length(input) && matches(next_term, input[it.stop])
+ # Matching terminal: a single-token item with the sentinel rule 0
+ enqueue!(next_candidates, CompletedItem(it.stop, it.stop+1, 0))
+ end
+ end
+ end
+ else
+ # backtrack: discard the exhausted queue together with the item chosen one level below
+ pop!(workstack)
+ pop!(items)
+ end
+ end
+ error("No solution - this should never happen")
+end
+
+
+### check grammar
+"""`checkgrammar(grammar)`
+
+Check if a given grammar can be parsed meaningfully. If the grammar is ok,
+`nothing` is returned and no action is performed.
+
+If a nonterminal appears on the right-hand side of a production but on the
+left-hand side of none, a warning naming one such nonterminal is emitted and
+checking continues.
+
+If the grammar is cyclic (allows for a derivation A ⇒* A for some nonterminal
+A, see `iscyclic`), an `ErrorException` is raised.
+"""
+function checkgrammar(grammar :: CFG)
+    # TODO: check for symbols on the lhs that cannot be reached from the start symbol
+
+    # Check for "undefined" nonterminals: used on some right-hand side, defined on no left-hand side
+    defined = Set(lhs for (lhs, _) in grammar.rules)
+    used = Set(term for (_, rhs) in grammar.rules for term in rhs if term isa Symbol)
+    undefined = setdiff(used, defined)
+    if !isempty(undefined)
+        @warn "Symbol does not appear on the left-hand side of any production: $(first(undefined))"
+    end
+
+    iscyclic(grammar) && error("grammar is cyclic") # TODO: include the actual cycle in the error message
+    return nothing
+end
+
+checkgrammar(grammar :: Grammar) = checkgrammar(grammar.cfg)
+
+
+"""`iscyclic(grammar)`
+
+Returns true iff the grammar is cyclic, i.e. allows for a derivation A ⇒* A
+(in at least one step), where A is a nonterminal and ⇒* means performing an
+arbitrary number of replacements according to the production rules.
+"""
+function iscyclic(grammar :: CFG)
+    ns = nullables(grammar) # TODO: avoid computing this twice
+    # unit_steps[A] collects every nonterminal B that A can be rewritten to on its own,
+    # i.e. there is a rule A → α B β in which all terms of α and β are nullable
+    unit_steps = Dict{Symbol,Set{Symbol}}()
+    for (lhs, rhs) in grammar.rules
+        targets = get!(unit_steps, lhs, Set{Symbol}())
+        for (pos, term) in enumerate(rhs)
+            term isa Symbol || continue
+            rest_vanishes = all(t -> t in ns, rhs[1:pos-1]) && all(t -> t in ns, rhs[pos+1:end])
+            rest_vanishes && push!(targets, term)
+        end
+    end
+    # the grammar is cyclic iff some nonterminal reaches itself by following these steps
+    for origin in keys(unit_steps)
+        seen = Set{Symbol}()
+        frontier = unit_steps[origin]
+        while !isempty(frontier)
+            origin in frontier && return true
+            union!(seen, frontier)
+            # next layer: everything one step away from a visited nonterminal, minus what was visited
+            reachable = union((get(unit_steps, s, Set{Symbol}()) for s in seen)...)
+            frontier = setdiff(reachable, seen)
+        end
+    end
+    return false
+end
+
+iscyclic(grammar :: Grammar) = iscyclic(grammar.cfg)
+
+
+### Debugging helper functions
+
+"""`printchart(grammar, chart)`
+
+Print an Earley chart (list of partial parses) to stdout in a somewhat human-
+readable format: one heading per chart set, followed by one line per item.
+"""
+function printchart(grammar, chart)
+    # Each item is shown as its rule with a bullet at the position of the dot, plus its start position.
+    for (i, items) in enumerate(chart)
+        println("(At token $i):")
+        for item in items
+            (lhs, rhs) = grammar.rules[item.rule]
+            print(" [Rule $(item.rule)] $lhs ⇒ ")
+            print(join(repr.(rhs[1:item.dot-1]), " "))
+            print(" • ")
+            print(join(repr.(rhs[item.dot:end]), " "))
+            println(" (starting at $(item.start))")
+        end
+    end
+end
+
+"""`printparsetree(grammar, input, tree)`
+
+Print a `ParseTree` to stdout in a somewhat readable form: one line per node,
+indented according to the node's depth (`depth` is the depth of `tree` itself).
+"""
+function printparsetree(grammar, input, t; depth=0)
+    indent = " "^depth
+    if t.rule == 0
+        println(indent * "Terminal '$(input[t.start])' at $(t.start)")
+        @assert isempty(t.children)
+        return
+    end
+    lhs, rhs = grammar.rules[t.rule]
+    println(indent * "$lhs ⇒ " * join(repr.(rhs), " ") * ", from $(t.start) to $(t.stop)")
+    for child in t.children
+        printparsetree(grammar, input, child; depth=depth+1)
+    end
+end
+
+end # module
A => src/Match.jl +144 -0
@@ 1,144 @@
+"""`Match`
+
+This submodule contains several datatypes that represent certain classes of
+tokens. Using these datatypes in the grammar instead of enumerating symbols is
+convenient for all but the most basic grammars.
+
+The following datatypes for unicode characters are available:
+
+* `ASCII` - matches ASCII characters
+* `Space` - matches Unicode whitespace
+* `Digit` - matches a decimal digit (0 through 9)
+* `HexDigit` - matches a hexadecimal digit (0 through 9 and 'a' through 'f')
+* `Letter` - matches a Unicode letter
+* `Lower` - matches a Unicode lower case character
+* `Upper` - matches a Unicode upper case character
+* `Print` - matches a Unicode printable character
+
+Moreover, the following datatypes are available for all types of tokens:
+
+* `AnyToken` - matches any token regardless of type or value
+* `OneOf` - matches a token (class) from a predefined list
+* `Predicate` - matches a token if that token fulfills a given predicate
+"""
+module Match
+
+export OneOf, Predicate
+export ASCII, Space, Digit, HexDigit, Letter, Lower, Upper, Print, AnyToken
+export matches
+
+"""`matches(class, token)`
+
+Returns true if `token` belongs to the given `class` of tokens. The default
+implementation compares equality, i.e.
+`matches(class, token) = token == class`.
+
+For context, see also: `recognize`, `parse`. For a list of predefined token
+classes, see `Match`.
+"""
+function matches(a, b)
+    a == b
+end
+
+"""`OneOf(alternatives)`
+
+Creates a token class that matches any one of the given alternatives. Each
+alternative is itself tested with `matches`, so token classes may be nested.
+"""
+struct OneOf{T}
+    alternatives :: Vector{T}
+    OneOf(as) = OneOf(collect(as))
+    OneOf(as :: Vector) = new{eltype(as)}(as)
+end
+
+function matches(as :: OneOf, b)
+    any(a -> matches(a, b), as.alternatives)
+end
+
+"""`ASCII()`
+
+Creates a token class that matches any ASCII character.
+"""
+struct ASCII end
+
+matches(_ :: ASCII, token) = isascii(token)
+
+"""`Space()`
+
+Creates a token class that matches any whitespace token (i.e. one for which
+`isspace` returns true).
+"""
+struct Space end
+
+matches(_ :: Space, token) = isspace(token)
+
+"""`Digit()`
+
+Creates a token class that matches the digits '0' through '9'.
+"""
+struct Digit end
+
+matches(_ :: Digit, token) = isdigit(token)
+
+"""`Letter()`
+
+Creates a token class that matches a unicode letter.
+"""
+struct Letter end
+
+matches(_ :: Letter, token) = isletter(token)
+
+"""`Lower()`
+
+Creates a token class that matches a lower-case unicode letter.
+"""
+struct Lower end
+
+matches(_ :: Lower, token) = islowercase(token)
+
+"""`Upper()`
+
+Creates a token class that matches an upper-case unicode letter.
+"""
+struct Upper end
+
+matches(_ :: Upper, token) = isuppercase(token)
+
+"""`Print()`
+
+Creates a token class that matches a printable character (as per `isprint`).
+"""
+struct Print end
+
+matches(_ :: Print, token) = isprint(token)
+
+"""`HexDigit()`
+
+Creates a token class that matches a hexadecimal digit, i.e. the digits '0'
+through '9' or latin letters 'a' through 'f' (in upper and lower case
+variants).
+"""
+struct HexDigit end
+
+matches(_ :: HexDigit, token) = isxdigit(token)
+
+"""`AnyToken()`
+
+Creates a token class that matches any token.
+"""
+struct AnyToken end
+
+matches(_ :: AnyToken, _) = true
+
+"""`Predicate(p)`
+
+Creates a token class that matches any token for which the given predicate `p`
+is true.
+"""
+struct Predicate{P}
+    p :: P
+end
+
+matches(pred :: Predicate, t) = pred.p(t)
+
+end # Match
A => test/json.jl +72 -0
@@ 1,72 @@
+module JSON
+
+using DataStructures
+using Earley
+
+# This is a JSON grammar translated from augmented BNF in RFC 4627.
+# It should not be assumed to be correct and is only used to test the Earley parser.
+grammar = Grammar([
+ # JSON text: the top level must be an object or an array
+ (:JSON => [:Object], identity),
+ (:JSON => [:Array], identity),
+
+ # Structural characters, each surrounded by optional whitespace (:WS); their actions discard everything
+ (Symbol("Begin Array") => [:WS, '[', :WS], (_,_,_) -> nothing),
+ (Symbol("Begin Object") => [:WS, '{', :WS], (_,_,_) -> nothing),
+ (Symbol("End Array") => [:WS, ']', :WS], (_,_,_) -> nothing),
+ (Symbol("End Object") => [:WS, '}', :WS], (_,_,_) -> nothing),
+ (Symbol("Name Separator") => [:WS, ':', :WS], (_,_,_) -> nothing),
+ (Symbol("Value Separator") => [:WS, ',', :WS], (_,_,_) -> nothing),
+ (:WS => [], () -> nothing),
+ (:WS => [:WS, Match.Space()], (_,_) -> nothing),
+
+ # Values
+ (:Value => [:Object], identity),
+ (:Value => [:Array], identity),
+ (:Value => [:Number], identity),
+ (:Value => [:String], identity),
+ (:Value => collect("false"), (_...) -> false),
+ (:Value => collect("null"), (_...) -> nothing),
+ (:Value => collect("true"), (_...) -> true),
+
+ # Objects: the members are collected as a vector of pairs and turned into an OrderedDict
+ (:Object => [Symbol("Begin Object"), :Members, Symbol("End Object")], (_, members, _) -> OrderedDict(members)),
+ (:Members => [], () -> Pair{Any,Any}[]),
+ (:Members => [:Member], m -> Pair{Any,Any}[m]),
+ (:Members => [:Members, Symbol("Value Separator"), :Member], (ms,_,m) -> vcat(ms, [m])),
+ (:Member => [:String, Symbol("Name Separator"), :Value], (s,_,v) -> (s=>v)),
+
+ # Arrays: the values are collected as a Vector{Any}
+ (:Array => [Symbol("Begin Array"), :Values, Symbol("End Array")], (_,vs,_) -> vs),
+
+ (:Values => [], () -> Any[]),
+ (:Values => [:Value], (v) -> Any[v]),
+ (:Values => [:Values, Symbol("Value Separator"), :Value], (vs,_,v) -> vcat(vs, [v])),
+
+ # Numbers: sign * "integer.fraction" (parsed as BigFloat) * 10^exponent
+ (:Number => [Symbol("Optional Minus"), :Int, Symbol("Optional Fractional Part"), Symbol("Optional Exponent")], (s,x,f,e) -> s * parse(BigFloat, "$x.$(f)") * 10^e),
+ (Symbol("Optional Minus") => [], () -> 1),
+ (Symbol("Optional Minus") => ['-'], (_) -> -1),
+ (:Int => [Match.Digit()], (c) -> string(c)),
+ (:Int => [Match.OneOf("123456789"), :Digits], (c, ds) -> c * ds),
+ (Symbol("Optional Fractional Part") => [], () -> "0"),
+ (Symbol("Optional Fractional Part") => ['.', :Digits], (_,ds) -> ds),
+ (:Digits => [Match.Digit()], (c) -> string(c)),
+ (:Digits => [Match.Digit(), :Digits], (c,ds) -> c * ds),
+ (Symbol("Optional Exponent") => [], () -> 0),
+ (Symbol("Optional Exponent") => [Match.OneOf("eE"), Symbol("Optional Sign"), :Digits], (_,s,ds) -> s * parse(BigFloat, ds)),
+ (Symbol("Optional Sign") => [], () -> 1),
+ (Symbol("Optional Sign") => ['+'], (_) -> 1),
+ (Symbol("Optional Sign") => ['-'], (_) -> -1),
+
+ # Strings: unescaped characters are taken as they are, escape sequences go through `unescape_string`
+ (:String => ['"', :Chars, '"'], (_, s, _) -> s),
+ (:Chars => [], () -> ""),
+ (:Chars => [:Chars, :Char], (cs,c) -> cs * c),
+ (:Char => [Match.Predicate(c -> (0x20 ≤ Int(c) ≤ 0x21) || (0x23 ≤ Int(c) ≤ 0x5b) || (0x5d ≤ Int(c) ≤ 0x10ffff))], identity),
+ (:Char => [:Escaped], c -> string(c)),
+ (:Escaped => ['\\', Match.OneOf("\"\\/bfnrt")], (_,c) -> unescape_string("\\$c")),
+ (:Escaped => ['\\', 'u', Match.HexDigit(), Match.HexDigit(), Match.HexDigit(), Match.HexDigit()], (_,_,cs...) -> unescape_string("\\u" * String(collect(cs)))),
+])
+
+end
A => test/runtests.jl +580 -0
@@ 1,580 @@
+using Test
+using Earley
+include("json.jl")
+
+@testset "nullables()" begin
+ g1 = CFG([ # every rule contains a terminal, so nothing is nullable
+ :A => ['a'],
+ :A => ['a', :A],
+ :B => ['b'],
+ :B => [:B, 'b']
+ ])
+ @test Earley.nullables(g1) == Set([])
+
+ g2 = CFG([
+ :A => [],
+ :B => [:A, 'a'],
+ :C => [:A, 'c'],
+ :C => [:A, :A], # nullable because both terms are nullable
+ ])
+ @test Earley.nullables(g2) == Set([:A, :C])
+
+ g3 = CFG([ # nullability propagates along the chain D, C, B, A
+ :A => [:D],
+ :B => [:A],
+ :C => [:B],
+ :D => [:C],
+ :D => []
+ ])
+ @test Earley.nullables(g3) == Set([:A, :B, :C, :D])
+
+ g4 = CFG([ # a rule that only refers to itself is not nullable
+ :A => [:A]
+ ])
+ @test Earley.nullables(g4) == Set([])
+
+ g5 = CFG([
+ :A => [:D, :C, :B],
+ :B => [:A, :A],
+ :C => [:A, :B, :D],
+ :D => [:D],
+ :D => [:C, :C, :B],
+ :A => []
+ ])
+ @test Earley.nullables(g5) == Set([:A, :B])
+
+ g6 = CFG([
+ :A => [],
+ :A => [:A, 'a'],
+ :B => [:A],
+ :B => [:B, 'b'],
+ :C => [:A, 'c', :B],
+ :C => [:B, 'c', :A],
+ :D => [:B],
+ :D => [:C],
+ :D => [:D, 'd']
+ ])
+ @test Earley.nullables(g6) == Set([:A, :B, :D])
+end
+
+@testset "Match" begin
+ @testset "OneOf" begin
+ @test matches(Match.OneOf("0123456789"), '0')
+ @test matches(Match.OneOf("0123456789"), '3')
+ @test matches(Match.OneOf("0123456789"), '8')
+ @test !matches(Match.OneOf("0123456789"), 'x')
+ @test !matches(Match.OneOf("0123456789"), "nine") # a String token, compared against Char alternatives
+
+ @test matches(Match.OneOf(["if", "true", "false", "else"]), "if")
+ @test matches(Match.OneOf(["if", "true", "false", "else"]), "true")
+ @test !matches(Match.OneOf(["if", "true", "false", "else"]), "bool")
+ end
+
+ @testset "ASCII" begin
+ @test matches(Match.ASCII(), 'a')
+ @test matches(Match.ASCII(), 'X')
+ @test matches(Match.ASCII(), '\r')
+ @test matches(Match.ASCII(), ' ')
+ @test matches(Match.ASCII(), '.')
+ @test matches(Match.ASCII(), '0')
+ @test matches(Match.ASCII(), '8')
+ @test !matches(Match.ASCII(), 'α')
+ @test !matches(Match.ASCII(), 'Σ')
+ @test !matches(Match.ASCII(), 'す')
+ @test !matches(Match.ASCII(), '🙈')
+ end
+
+ @testset "Space" begin
+ @test !matches(Match.Space(), 'a')
+ @test !matches(Match.Space(), 'X')
+ @test matches(Match.Space(), '\r')
+ @test matches(Match.Space(), ' ')
+ @test !matches(Match.Space(), '.')
+ @test !matches(Match.Space(), '0')
+ @test !matches(Match.Space(), '8')
+ @test !matches(Match.Space(), 'α')
+ @test !matches(Match.Space(), 'Σ')
+ @test !matches(Match.Space(), 'す')
+ @test !matches(Match.Space(), '🙈')
+ end
+
+ @testset "Digit" begin
+ @test !matches(Match.Digit(), 'a')
+ @test !matches(Match.Digit(), 'X')
+ @test !matches(Match.Digit(), '\r')
+ @test !matches(Match.Digit(), ' ')
+ @test !matches(Match.Digit(), '.')
+ @test matches(Match.Digit(), '0')
+ @test matches(Match.Digit(), '8')
+ @test !matches(Match.Digit(), 'α')
+ @test !matches(Match.Digit(), 'Σ')
+ @test !matches(Match.Digit(), 'す')
+ @test !matches(Match.Digit(), '🙈')
+ end
+
+ @testset "Letter" begin
+ @test matches(Match.Letter(), 'a')
+ @test matches(Match.Letter(), 'X')
+ @test !matches(Match.Letter(), '\r')
+ @test !matches(Match.Letter(), ' ')
+ @test !matches(Match.Letter(), '.')
+ @test !matches(Match.Letter(), '0')
+ @test !matches(Match.Letter(), '8')
+ @test matches(Match.Letter(), 'α')
+ @test matches(Match.Letter(), 'Σ')
+ @test matches(Match.Letter(), 'す')
+ @test !matches(Match.Letter(), '🙈')
+ end
+
+ @testset "Lower" begin
+ @test matches(Match.Lower(), 'a')
+ @test !matches(Match.Lower(), 'X')
+ @test !matches(Match.Lower(), '\r')
+ @test !matches(Match.Lower(), ' ')
+ @test !matches(Match.Lower(), '.')
+ @test !matches(Match.Lower(), '0')
+ @test !matches(Match.Lower(), '8')
+ @test matches(Match.Lower(), 'α')
+ @test !matches(Match.Lower(), 'Σ')
+ @test !matches(Match.Lower(), 'す') # a letter (see "Letter" above), but neither lower nor upper case
+ @test !matches(Match.Lower(), '🙈')
+ end
+
+ @testset "Upper" begin
+ @test !matches(Match.Upper(), 'a')
+ @test matches(Match.Upper(), 'X')
+ @test !matches(Match.Upper(), '\r')
+ @test !matches(Match.Upper(), ' ')
+ @test !matches(Match.Upper(), '.')
+ @test !matches(Match.Upper(), '0')
+ @test !matches(Match.Upper(), '8')
+ @test !matches(Match.Upper(), 'α')
+ @test matches(Match.Upper(), 'Σ')
+ @test !matches(Match.Upper(), 'す')
+ @test !matches(Match.Upper(), '🙈')
+ end
+
+ @testset "Print" begin
+ @test matches(Match.Print(), 'a')
+ @test matches(Match.Print(), 'X')
+ @test !matches(Match.Print(), '\r') # the only character in this list that is not printable
+ @test matches(Match.Print(), ' ')
+ @test matches(Match.Print(), '.')
+ @test matches(Match.Print(), '0')
+ @test matches(Match.Print(), '8')
+ @test matches(Match.Print(), 'α')
+ @test matches(Match.Print(), 'Σ')
+ @test matches(Match.Print(), 'す')
+ @test matches(Match.Print(), '🙈')
+ end
+
+ @testset "HexDigit" begin
+ @test matches(Match.HexDigit(), 'a')
+ @test !matches(Match.HexDigit(), 'X') # a letter outside the range 'a' through 'f'
+ @test !matches(Match.HexDigit(), '\r')
+ @test !matches(Match.HexDigit(), ' ')
+ @test !matches(Match.HexDigit(), '.')
+ @test matches(Match.HexDigit(), '0')
+ @test matches(Match.HexDigit(), '8')
+ @test !matches(Match.HexDigit(), 'α')
+ @test !matches(Match.HexDigit(), 'Σ')
+ @test !matches(Match.HexDigit(), 'す')
+ @test !matches(Match.HexDigit(), '🙈')
+ end
+
+ @testset "AnyToken" begin
+ @test matches(Match.AnyToken(), 'a')
+ @test matches(Match.AnyToken(), 'X')
+ @test matches(Match.AnyToken(), '\r')
+ @test matches(Match.AnyToken(), ' ')
+ @test matches(Match.AnyToken(), '.')
+ @test matches(Match.AnyToken(), '0')
+ @test matches(Match.AnyToken(), '8')
+ @test matches(Match.AnyToken(), 'α')
+ @test matches(Match.AnyToken(), 'Σ')
+ @test matches(Match.AnyToken(), 'す')
+ @test matches(Match.AnyToken(), '🙈')
+ @test matches(Match.AnyToken(), "false") # tokens need not be characters
+ @test matches(Match.AnyToken(), 25)
+ end
+
+ @testset "Predicate" begin
+ p(c) = isletter(c) || isdigit(c)
+ M = Match.Predicate(p)
+ @test matches(M, 'a')
+ @test matches(M, 'X')
+ @test !matches(M, '\r')
+ @test !matches(M, ' ')
+ @test !matches(M, '.')
+ @test matches(M, '0')
+ @test matches(M, '8')
+ @test matches(M, 'α')
+ @test matches(M, 'Σ')
+ @test matches(M, 'す')
+ @test !matches(M, '🙈')
+ end
+
+ @testset "Advanced OneOf" begin
+ # the alternatives of a OneOf may themselves be token classes
+ @test matches(Match.OneOf([Match.Lower(), Match.Digit()]), 'a')
+ @test matches(Match.OneOf([Match.Lower(), Match.Digit()]), '0')
+ @test matches(Match.OneOf([Match.Lower(), Match.Digit()]), '9')
+ @test matches(Match.OneOf([Match.Lower(), Match.Digit()]), 'x')
+ @test matches(Match.OneOf([Match.Lower(), Match.Digit()]), 'α')
+ @test !matches(Match.OneOf([Match.Lower(), Match.Digit()]), 'A')
+ @test !matches(Match.OneOf([Match.Lower(), Match.Digit()]), 'Z')
+ @test !matches(Match.OneOf([Match.Lower(), Match.Digit()]), '!')
+
+ @test matches(Match.OneOf([Match.OneOf(["true", "false"]), "bool"]), "true")
+ @test matches(Match.OneOf([Match.OneOf(["true", "false"]), "bool"]), "bool")
+ @test !matches(Match.OneOf([Match.OneOf(["true", "false"]), "bool"]), "int")
+ end
+end
+
+@testset "recognize(::CFG)" begin
+ grammar_par1 = CFG([
+ :Par => [], # matches empty string
+ :Par => ['(', :Par, ')'],
+ ])
+ @test recognize(grammar_par1, "")
+ @test recognize(grammar_par1, "()")
+ @test !recognize(grammar_par1, "(")
+ @test !recognize(grammar_par1, ")")
+ @test !recognize(grammar_par1, ")(")
+ @test !recognize(grammar_par1, "(12)")
+ @test recognize(grammar_par1, "((()))")
+ @test !recognize(grammar_par1, "()()") # this grammar only nests, it has no rule for concatenation
+ @test recognize(grammar_par1, "(((((((((((((((((((((((((((((())))))))))))))))))))))))))))))")
+ @test !recognize(grammar_par1, "((((((((((((((((((((((((((((())))))))))))))))))))))))))))))")
+ @test !recognize(grammar_par1, "(((((((((((((((((((((((((((((()))))))))))))))))))))))))))))")
+
+ # nested parentheses and square brackets
+ grammar_par2 = CFG([
+ :Par => [],
+ :Par => ['(', :Par, ')'],
+ :Par => ['[', :Par, ']'],
+ ])
+ @test recognize(grammar_par2, "")
+ @test recognize(grammar_par2, "()")
+ @test recognize(grammar_par2, "[]")
+ @test !recognize(grammar_par2, "][")
+ @test !recognize(grammar_par2, "12")
+ @test recognize(grammar_par2, "(([()]))")
+ @test recognize(grammar_par2, "([[([(())])]])")
+
+ # left-recursive grammar matching an even number of 'a' characters
+ grammar_a1 = CFG([
+ :A => [],
+ :A => [:A, 'a', 'a']
+ ])
+ @test recognize(grammar_a1, "")
+ @test !recognize(grammar_a1, "a")
+ @test recognize(grammar_a1, "aa")
+ @test !recognize(grammar_a1, "aaa")
+ @test recognize(grammar_a1, "aaaa")
+
+ # right-recursive grammar matching an even number of 'a' characters (rebinds the name `grammar_a1`)
+ grammar_a1 = CFG([
+ :A => [],
+ :A => ['a', 'a', :A]
+ ])
+ @test recognize(grammar_a1, "")
+ @test !recognize(grammar_a1, "a")
+ @test recognize(grammar_a1, "aa")
+ @test !recognize(grammar_a1, "aaa")
+ @test recognize(grammar_a1, "aaaa")
+
+ # a simultaneously left-recursive, right-recursive, and ambiguous grammar
+ grammar_a = CFG([
+ :A => ['a'],
+ :A => [:A, :A]
+ ])
+ @test !recognize(grammar_a, "")
+ @test recognize(grammar_a, "a")
+ @test !recognize(grammar_a, "ab")
+ @test recognize(grammar_a, "aa")
+ @test recognize(grammar_a, "aaa")
+ @test recognize(grammar_a, "aaaa")
+ @test recognize(grammar_a, "aaaaa")
+ @test recognize(grammar_a, "aaaaaa")
+
+ grammar_cyclic = CFG([ # the only rule refers to itself, so no input is accepted
+ :A => [:A]
+ ])
+ @test !recognize(grammar_cyclic, "")
+ @test !recognize(grammar_cyclic, "a")
+ @test !recognize(grammar_cyclic, "abc")
+ @test !recognize(grammar_cyclic, "1234567890")
+
+ grammar_nullable = CFG([ # :B can be empty, so `:A => [:B, :A]` exercises nullable prediction
+ :A => ['a'],
+ :A => [:B, :A],
+ :B => [],
+ :B => [:A]
+ ])
+ @test recognize(grammar_nullable, "a")
+ @test recognize(grammar_nullable, "aaa")
+
+ grammar_ab = CFG([ # uses a token class instead of a literal terminal
+ :AB => [Match.OneOf("ab")],
+ :AB => [:AB, :AB]
+ ])
+ @test !recognize(grammar_ab, "")
+ @test recognize(grammar_ab, "ab")
+ @test recognize(grammar_ab, "ba")
+ @test recognize(grammar_ab, "aa")
+ @test recognize(grammar_ab, "bbbbbb")
+ @test recognize(grammar_ab, "abababaabbba")
+ @test recognize(grammar_ab, "abababaabbbaababbabababababababbababaaaaabbabbbbbb")
+ @test !recognize(grammar_ab, "acb")
+
+ grammar_decimal = CFG([ # decimal integers with optional sign and without leading zeros
+ :Decimal => [:Nonnegative],
+ :Decimal => [Match.OneOf("+-"), :Nonnegative],
+ :Nonnegative => [Match.Digit()],
+ :Nonnegative => [Match.OneOf("123456789"), :Digits],
+ :Digits => [],
+ :Digits => [:Digits, Match.Digit()],
+ ])
+ @test !recognize(grammar_decimal, "")
+ @test recognize(grammar_decimal, "0")
+ @test recognize(grammar_decimal, "1")
+ @test recognize(grammar_decimal, "2")
+ @test recognize(grammar_decimal, "-0")
+ @test recognize(grammar_decimal, "+5")
+ @test recognize(grammar_decimal, "12389348279824792837492")
+ @test recognize(grammar_decimal, "+12389348279824792837492")
+ @test recognize(grammar_decimal, "-12389348279824792837492")
+ @test !recognize(grammar_decimal, "09")
+ @test !recognize(grammar_decimal, "023")
+ @test !recognize(grammar_decimal, "0x42")
+end
+
+@testset "recognize(::CFG) large grammars" begin # recognizer checks against the JSON grammar's CFG
+ json = JSON.grammar.cfg # the `cfg` field of JSON.grammar; handed to recognize() below
+ # Pins the exact set returned by Earley.nullables (presumably the nonterminals that can derive the empty string -- confirm).
+ @test Earley.nullables(json) == Set([:WS, :Members, :Values, Symbol("Optional Minus"), Symbol("Optional Fractional Part"), Symbol("Optional Exponent"), Symbol("Optional Sign"), :Chars])
+ # Inputs expected to be accepted: each one is an object or an array at the top level.
+ @test recognize(json, "{}")
+ @test recognize(json, "[]")
+ @test recognize(json, "[true]")
+ @test recognize(json, """[""]""") # array holding an empty JSON string
+ @test recognize(json, """["green\\nしろ"]""") # `\\n` reaches the grammar as backslash + 'n' (a JSON escape); also non-ASCII characters
+ @test recognize(json, "[1]")
+ @test recognize(json, "[-0]")
+ @test recognize(json, "[123456789012345678901234567890]")
+ @test recognize(json, "[2e6]")
+ @test recognize(json, "[3.5]")
+ @test recognize(json, "[-12.0]")
+ @test recognize(json, "[true, null, false]")
+ @test recognize(json, """{"twelve": null}""")
+ @test recognize(json, """{ "α" : 7.3e-3\n,\n"R": 0.12820 }""") # `\n` here is a real newline, i.e. whitespace between tokens
+ @test !recognize(json, "12") # bare number at top level is expected to be rejected
+ @test !recognize(json, "\"yellow\"") # bare string at top level is expected to be rejected
+ @test !recognize(json, "{true}") # object containing a value without a key
+ @test !recognize(json, "[FALSE]") # upper-case spelling of a literal
+end
+
+@testset "recognize(::Grammar)" begin # recognize() on a Grammar (rules paired with semantic actions)
+ grammar_par1 = Grammar([ # nested/concatenated parentheses: empty | Par Par | '(' Par ')'
+ (:Par => [], () -> error()), # every action throws if called, so these tests only pass if recognize() never runs the actions
+ (:Par => [:Par, :Par], (_,_) -> error()),
+ (:Par => ['(', :Par, ')'], (_,_,_) -> error()),
+ ])
+ @test recognize(grammar_par1, "")
+ @test recognize(grammar_par1, "()")
+ @test !recognize(grammar_par1, "(")
+ @test !recognize(grammar_par1, ")")
+ @test !recognize(grammar_par1, ")(")
+ @test !recognize(grammar_par1, "(12)") # contains characters that no rule matches
+ @test recognize(grammar_par1, "((()))")
+ @test recognize(grammar_par1, "()()")
+ @test recognize(grammar_par1, "(((()()(()))))")
+ @test !recognize(grammar_par1, "((((()(()))))")
+ @test recognize(grammar_par1, "(((((((((((((((((((((((((((((())))))))))))))))))))))))))))))") # deep nesting, expected to be accepted
+ @test !recognize(grammar_par1, "((((((((((((((((((((((((((((())))))))))))))))))))))))))))))") # near-miss variant of the deep nesting, expected to be rejected
+ @test !recognize(grammar_par1, "(((((((((((((((((((((((((((((()))))))))))))))))))))))))))))") # near-miss variant of the deep nesting, expected to be rejected
+end
+
+@testset "checkgrammar()" begin # every grammar below must make Earley.checkgrammar throw an ErrorException
+ g1 = CFG([ # in each grammar here some nonterminal can derive itself; presumably that cycle is what checkgrammar rejects -- confirm
+ :A => [:A],
+ :A => []
+ ])
+ @test_throws ErrorException Earley.checkgrammar(g1)
+
+ g2 = CFG([ # same rules as g1 in the opposite order
+ :A => [],
+ :A => [:A]
+ ])
+ @test_throws ErrorException Earley.checkgrammar(g2)
+
+ g3 = CFG([ # indirect cycle A -> B -> A; the terminal rules do not remove it
+ :A => [:B],
+ :B => [:A],
+ :A => ['a'],
+ :B => ['b'],
+ ])
+ @test_throws ErrorException Earley.checkgrammar(g3)
+
+ g4 = CFG([ # the unit rule A -> A sits among left- and right-recursive rules
+ :A => [:A, 'a'],
+ :A => ['a', :A],
+ :A => [:A],
+ :A => ['a']
+ ])
+ @test_throws ErrorException Earley.checkgrammar(g4)
+
+ g5 = CFG([ # no unit rule: the cycle only arises because :A => [] lets :B => [:A, :B] derive :B
+ :A => [:B, :B],
+ :B => ['a', 'b'],
+ :B => [:A, :B],
+ :A => [],
+ :B => [:A, :A, :A],
+ ])
+ @test_throws ErrorException Earley.checkgrammar(g5)
+
+ g6 = Grammar([ # same checks through Grammar (rules with actions) instead of CFG
+ (:A => [:A], identity),
+ (:A => ['a'], identity)
+ ])
+ @test_throws ErrorException Earley.checkgrammar(g6)
+
+ g7 = Grammar([ # Grammar counterpart of g3
+ (:A => [:B], identity),
+ (:B => [:A], identity),
+ (:A => ['a'], identity),
+ (:B => ['b'], identity)
+ ])
+ @test_throws ErrorException Earley.checkgrammar(g7)
+end
+
+@testset "parse(::CFG)" begin # parse() on a bare CFG; the tests inspect the returned tree's rule/start/stop/children fields
+ grammar_par = CFG([ # rule numbers asserted below follow this order: 1 = empty, 2 = '(' Par ')', 3 = '[' Par ']'
+ :Par => [],
+ :Par => ['(', :Par, ')'],
+ :Par => ['[', :Par, ']'],
+ ])
+ @testset "parens, \"\"" begin
+ tree = parse(grammar_par, "")
+ @test tree.rule == 1 && tree.start == 1 && tree.stop == 1 && isempty(tree.children) # empty match: start == stop
+ end
+ @testset "parens, \"()\"" begin
+ tree = parse(grammar_par, "()")
+ @test tree.rule == 2 && tree.start == 1 && tree.stop == 3 && length(tree.children) == 3 # two characters consumed: stop == start + 2
+ (child1, child2, child3) = tree.children
+ @test child1.rule == 0 && child1.start == 1 && child1.stop == 2 && isempty(child1.children) # '(' : nodes for matched characters are asserted to have rule == 0
+ @test child2.rule == 1 && child2.start == 2 && child2.stop == 2 && isempty(child2.children) # empty Par between the parentheses
+ @test child3.rule == 0 && child3.start == 2 && child3.stop == 3 && isempty(child3.children) # ')'
+ end
+ @testset "parens, \"([])\"" begin
+ tree = parse(grammar_par, "([])")
+ @test tree.rule == 2 && tree.start == 1 && tree.stop == 5 && length(tree.children) == 3
+ (child1, child2, child3) = tree.children
+ @test child1.rule == 0 && child1.start == 1 && child1.stop == 2 && isempty(child1.children) # '('
+ @test child2.rule == 3 && child2.start == 2 && child2.stop == 4 && length(child2.children) == 3 # inner "[]" via rule 3
+ @test child3.rule == 0 && child3.start == 4 && child3.stop == 5 && isempty(child3.children) # ')'
+ (child21, child22, child23) = child2.children
+ @test child21.rule == 0 && child21.start == 2 && child21.stop == 3 && isempty(child21.children) # '['
+ @test child22.rule == 1 && child22.start == 3 && child22.stop == 3 && isempty(child22.children) # empty Par inside the brackets
+ @test child23.rule == 0 && child23.start == 3 && child23.stop == 4 && isempty(child23.children) # ']'
+ end
+end
+
+@testset "parse(::Grammar)" begin
+ # parse() on a Grammar: the compared values are whatever the rules' actions build from their matched parts.
+ @testset "left/right recursive" begin
+ gl = Grammar([ # left-recursive: A -> empty | A 'a'
+ (:A => [], () -> []),
+ (:A => [:A, 'a'], (e,a) -> [e, a])
+ ])
+ gr = Grammar([ # right-recursive: A -> empty | 'a' A
+ (:A => [], () -> []),
+ (:A => ['a', :A], (a,e) -> [a, e])
+ ])
+ glr = Grammar([ # recursive on both sides: A -> empty | A 'a' A (ambiguous for two or more 'a')
+ (:A => [], () -> []),
+ (:A => [:A, 'a', :A], (e1,a,e2) -> [e1, a, e2])
+ ])
+
+ @test parse(gl, "") == []
+ @test parse(gr, "") == []
+ @test parse(glr, "") == []
+
+ @test parse(gl, "a") == [[], 'a']
+ @test parse(gr, "a") == ['a', []]
+ @test parse(glr, "a") == [[], 'a', []]
+
+ @test parse(gl, "aa") == [[[], 'a'], 'a'] # nesting grows to the left
+ @test parse(gr, "aa") == ['a', ['a', []]] # nesting grows to the right
+ @test parse(glr, "aa") == [[], 'a', [[], 'a', []]] # of the two possible trees, the right-nested one is expected
+
+ @test parse(gl, "aaaaa") == [[[[[[], 'a'], 'a'], 'a'], 'a'], 'a']
+ @test parse(gr, "aaaaa") == ['a', ['a', ['a', ['a', ['a', []]]]]]
+ #@test parse(glr, "aaaaa") # disabled: no expected value is given; glr is ambiguous for this input
+ end
+
+ @testset "dangling else" begin
+ # Two grammars for a trivial C-like syntax with dangling else ambiguity.
+ # Depending on the order of the third and fourth rule, the ambiguity is resolved differently.
+ ra = [ # rule/action pairs shared by both grammars; terminals are whole String tokens
+ (:Block => ["{}"], (_) -> []),
+ (:Block => [:If], e -> e),
+ (:If => ["if()", :Block], (_,b) -> ["if()", b]), # if without else -> two-element result
+ (:If => ["if()", :Block, "else", :Block], (_,c,_,a) -> ["if()", c, a]) # if with else -> three-element result
+ ]
+ g_inner = Grammar{String}(ra) # if-without-else listed before if-else
+ g_outer = Grammar{String}([ra[1], ra[2], ra[4], ra[3]]) # same rules with the two :If rules swapped
+
+ @test parse(g_inner, ["{}"]) == [] # input is a vector of String tokens
+ @test parse(g_outer, ["{}"]) == []
+
+ @test parse(g_inner, ["if()", "{}"]) == ["if()", []]
+ @test parse(g_outer, ["if()", "{}"]) == ["if()", []]
+
+ @test parse(g_inner, ["if()", "{}", "else", "{}"]) == ["if()", [], []]
+ @test parse(g_outer, ["if()", "{}", "else", "{}"]) == ["if()", [], []]
+
+ @test parse(g_inner, ["if()", "if()", "{}"]) == ["if()", ["if()", []]]
+ @test parse(g_outer, ["if()", "if()", "{}"]) == ["if()", ["if()", []]]
+
+ # This is where the ambiguity comes into play
+ @test parse(g_inner, ["if()", "if()", "{}", "else", "{}"]) == ["if()", ["if()", [], []]] # else attached to the inner if
+ @test parse(g_outer, ["if()", "if()", "{}", "else", "{}"]) == ["if()", ["if()", []], []] # else attached to the outer if
+ end
+
+ @testset "parse(::Grammar) large grammars" begin
+ json = JSON.grammar # the full grammar (with actions), unlike the `.cfg` field used by the recognizer tests
+ @test parse(json, "{}") == Dict()
+ @test parse(json, "[]") == []
+ @test parse(json, "[123456789012345678901234567890]") == [123456789012345678901234567890] # integer literal beyond the Int64 range
+ @test parse(json, "[-0]") == [0]
+ @test parse(json, "[true, false, null]") == [true, false, nothing] # JSON null is expected to come back as `nothing`
+ @test parse(json, """{"twelve": null}""") == Dict("twelve" => nothing)
+ @test parse(json, """{"\\r": [{}], "\\u03b1": "true"}""") == Dict("\r" => [Dict()], "α" => "true") # JSON escapes \r and \u03b1 are expected to be decoded
+ @test isapprox(parse(json, "[3.14195e5, 13e-2]"), [314195, 0.13]) # approximate comparison for floating-point values
+ @test parse(json, collect("""{ "α" : "7.3e-3"\n,\n"R": "0.12820" }""")) == Dict("α" => "7.3e-3", "R" => "0.12820") # input passed as a Vector{Char} (collect) instead of a String
+ end
+
+ @testset "mixed associativity" begin
+ g = Grammar([ # A simple arithmetic grammar with mixed associativity.
+ (:expression => [:sum], identity),
+ (:expression => [:product], identity),
+ (:sum => [:sum, '+', :product], (e1,_,e2) -> Expr(:call, :+, e1, e2)), # left-recursive rule; actions build Julia Expr call nodes
+ (:sum => [:sum, '-', :product], (e1,_,e2) -> Expr(:call, :-, e1, e2)),
+ (:sum => [:product], identity),
+ (:product => [:factor], identity),
+ (:product => [:product, '*', :factor], (e1,_,e2) -> Expr(:call, :*, e1, e2)),
+ (:factor => [:number], identity),
+ (:factor => [:power], identity),
+ (:factor => ['(', :expression, ')'], (_,e,_) -> e), # parentheses are dropped; only the inner expression is kept
+ (:number => [Match.Digit()], c -> c-'0'), # single digit character converted to its integer value
+ (:power => [:factor, '^', :factor], (e1,_,e2) -> Expr(:call, :^, e1, e2)), # a chain of '^' is ambiguous under this rule; the tests expect the right-nested tree
+ ]);
+
+ @test parse(g, "1-2-3") == :((1-2)-3) # expected values are quoted Julia expressions
+ @test parse(g, "2*3*4") == :((2*3)*4)
+ @test parse(g, "2^3^4") == :(2^(3^4))
+ @test parse(g, "1+2-3+4") == :(((1+2)-3)+4)
+ @test parse(g, "2*3^4^(5+6)*7") == :((2*3^(4^(5+6)))*7)
+ @test parse(g, "1-2*3^4+5") == :((1-2*3^4)+5)
+ end
+end