~boringcactus/crowbar-reference-compiler

338049020a17831e68b6a437bb038d8f10bfc45e — Melody Horn 1 year, 9 months ago d8cd8a6
bring in updates to spec
M crowbar_reference_compiler/parser.py => crowbar_reference_compiler/parser.py +143 -158
@@ 2,169 2,154 @@ from parsimonious import TokenGrammar, ParseError, IncompleteParseError  # type:

grammar = TokenGrammar(
    r"""
HeaderFile                 = HeaderFileElement+
HeaderFileElement          = IncludeStatement /
                             TypeDeclaration /
                             FunctionDeclaration

ImplementationFile         = ImplementationFileElement+
ImplementationFileElement  = HeaderFileElement /
                             FunctionDefinition

IncludeStatement           = "include" string_literal ";"

TypeDeclaration            = StructDeclaration /
                             EnumDeclaration /
                             TypedefDeclaration
StructDeclaration          = "struct" identifier "{" VariableDeclaration+ "}" ";"
EnumDeclaration            = "enum" identifier "{" EnumBody "}" ";"
EnumBody                   = (identifier ("=" Expression)? "," EnumBody) /
                             (identifier ("=" Expression)? ","?)
TypedefDeclaration         = "typedef" identifier "=" Type ";"

FunctionDeclaration        = FunctionSignature ";"
FunctionDefinition         = FunctionSignature Block
FunctionSignature          = Type identifier "(" SignatureArguments? ")"
SignatureArguments         = (Type identifier "," SignatureArguments) /
                             (Type identifier ","?)

Block                      = "{" Statement* "}"
HeaderFile <- IncludeStatement* HeaderFileElement+
HeaderFileElement <- TypeDefinition / FunctionDeclaration / VariableDefinition / VariableDeclaration

ImplementationFile <- IncludeStatement* ImplementationFileElement+
ImplementationFileElement <- TypeDefinition / VariableDefinition / FunctionDefinition

IncludeStatement <- 'include' string-literal ';'

TypeDefinition <- StructDefinition / EnumDefinition / UnionDefinition
StructDefinition <- NormalStructDefinition / OpaqueStructDefinition
NormalStructDefinition <- 'struct' identifier '{' VariableDeclaration+ '}'
OpaqueStructDefinition <- 'opaque' 'struct' identifier ';'
EnumDefinition <- 'enum' identifier '{' EnumMember (',' EnumMember)* ','? '}'
EnumMember <- identifier ('=' Expression)?
UnionDefinition <- RobustUnionDefinition / FragileUnionDefinition
RobustUnionDefinition <- 'union' identifier '{' VariableDeclaration UnionBody '}'
UnionBody <- 'switch' '(' identifier ')' '{' UnionBodySet+ '}'
UnionBodySet <- CaseSpecifier+ (VariableDeclaration / ';')
FragileUnionDefinition <- 'fragile' 'union' identifier '{' VariableDeclaration+ '}'

FunctionDeclaration <- FunctionSignature ';'
FunctionDefinition <- FunctionSignature Block
FunctionSignature <- Type identifier '(' SignatureArguments? ')'
SignatureArguments <- Type identifier (',' Type identifier)* ','?

Block <- '{' Statement* '}'
       
Statement                  = VariableDefinition /
                             VariableDeclaration /
                             IfStatement /
                             SwitchStatement /
                             WhileStatement /
                             DoWhileStatement /
                             ForStatement /
                             FlowControlStatement /
                             AssignmentStatement /
                             ExpressionStatement

VariableDefinition         = Type identifier "=" Expression ";"
VariableDeclaration        = Type identifier ";"

IfStatement                = ("if" Expression Block "else" Block) /
                             ("if" Expression Block)

SwitchStatement            = "switch" Expression "{" SwitchCase+ "}"
SwitchCase                 = (CaseSpecifier Block) /
                             ("default" Block)
CaseSpecifier              = ("case" Expression "," CaseSpecifier) /
                             ("case" Expression ","?)

WhileStatement             = "while" Expression Block
DoWhileStatement           = "do" Block "while" Expression ";"
ForStatement               = "for" VariableDefinition? ";" Expression ";" AssignmentStatementBody? Block
   
FlowControlStatement       = ("continue" ";") /
                             ("break" ";") /
                             ("return" Expression? ";")
   
AssignmentStatement        = AssignmentStatementBody ";"
AssignmentStatementBody    = (AssignmentTargetExpression "=" Expression) /
                             (AssignmentTargetExpression "+=" Expression) /
                             (AssignmentTargetExpression "-=" Expression) /
                             (AssignmentTargetExpression "*=" Expression) /
                             (AssignmentTargetExpression "/=" Expression) /
                             (AssignmentTargetExpression "%=" Expression) /
                             (AssignmentTargetExpression "&=" Expression) /
                             (AssignmentTargetExpression "^=" Expression) /
                             (AssignmentTargetExpression "|=" Expression) /
                             (AssignmentTargetExpression "++") /
                             (AssignmentTargetExpression "--")

ExpressionStatement        = Expression ";"
   
Type                       = ("const" BasicType) /
                             (BasicType "*") /
                             (BasicType "[" Expression "]") /
                             (BasicType "function" "(" (BasicType ",")* ")") /
                             BasicType
BasicType                  = "void" /
                             IntegerType /
                             ("signed" IntegerType) /
                             ("unsigned" IntegerType) /
                             "float" /
                             "double" /
                             "bool" /
                             ("struct" identifier) /
                             ("enum" identifier) /
                             ("typedef" identifier) /
                             ("(" Type ")")
IntegerType                = "char" /
                             "short" /
                             "int" /
                             "long"

AssignmentTargetExpression = identifier ATEElementSuffix*
ATEElementSuffix           = ("[" Expression "]") /
                             ("." identifier) /
                             ("->" identifier)

AtomicExpression           = identifier /
                             constant /
                             string_literal /
                             ("(" Expression ")")

ObjectExpression           = (AtomicExpression ObjectSuffix*) /
                             ArrayLiteralExpression /
                             StructLiteralExpression
ObjectSuffix               = ("[" Expression "]") /
                             ("(" CommasExpressionList? ")") /
                             ("." identifier) /
                             ("->" identifier)
CommasExpressionList       = (Expression "," CommasExpressionList) /
                             (Expression ","?)
ArrayLiteralExpression     = "{" CommasExpressionList "}"
StructLiteralExpression    = "{" StructLiteralBody "}"
StructLiteralBody          = (StructLiteralElement "," StructLiteralBody?) /
                             (StructLiteralElement ","?)
StructLiteralElement       = "." identifier "=" Expression

FactorExpression           = ("(" Type ")" FactorExpression) /
                             ("&" FactorExpression) /
                             ("*" FactorExpression) /
                             ("+" FactorExpression) /
                             ("-" FactorExpression) /
                             ("~" FactorExpression) /
                             ("!" FactorExpression) /
                             ("sizeof" FactorExpression) /
                             ("sizeof" Type) /
                             ObjectExpression

TermExpression             = FactorExpression TermSuffix*
TermSuffix                 = ("*" FactorExpression) /
                             ("/" FactorExpression) /
                             ("%" FactorExpression)

ArithmeticExpression       = TermExpression ArithmeticSuffix*
ArithmeticSuffix           = ("+" TermExpression) /
                             ("-" TermExpression)

BitwiseOpExpression        = (ArithmeticExpression "<<" ArithmeticExpression) /
                             (ArithmeticExpression ">>" ArithmeticExpression) /
                             (ArithmeticExpression "^" ArithmeticExpression) /
                             (ArithmeticExpression ("&" ArithmeticExpression)+) /
                             (ArithmeticExpression ("|" ArithmeticExpression)+) /
                             ArithmeticExpression

ComparisonExpression       = (BitwiseOpExpression "==" BitwiseOpExpression) /
                             (BitwiseOpExpression "!=" BitwiseOpExpression) /
                             (BitwiseOpExpression "<=" BitwiseOpExpression) /
                             (BitwiseOpExpression ">=" BitwiseOpExpression) /
                             (BitwiseOpExpression "<" BitwiseOpExpression) /
                             (BitwiseOpExpression ">" BitwiseOpExpression) /
                             BitwiseOpExpression

Expression                 = (ComparisonExpression ("&&" ComparisonExpression)+) /
                             (ComparisonExpression ("||" ComparisonExpression)+) /
                             ComparisonExpression
Statement <- VariableDefinition / StructureStatement / FlowControlStatement / AssignmentStatement / FragileStatement / ExpressionStatement / EmptyStatement
EmptyStatement <- ';'
FragileStatement <- 'fragile' Statement
ExpressionStatement <- Expression ';'

VariableDeclaration <- Type identifier ';'
VariableDefinition <- Type identifier '=' Expression ';'

StructureStatement <- IfStatement / SwitchStatement / WhileStatement / DoWhileStatement / ForStatement
IfStatement <- 'if' '(' Expression ')' Block ('else' Block)?
SwitchStatement <- 'switch' '(' Expression ')' '{' (CaseSpecifier / Statement)+ '}'
CaseSpecifier <- ('case' Expression ':') / ('default' ':')
WhileStatement <- 'while' '(' Expression ')' Block
DoWhileStatement <- 'do' Block 'while' '(' Expression ')' ';'
ForStatement <- 'for' '(' ForInit? ';' Expression ';' ForUpdate? ')' Block
ForInit <- ForInitializer (',' ForInitializer)* ','?
ForInitializer <- Type identifier '=' Expression
ForUpdate <- AssignmentBody (',' AssignmentBody)* ','?

FlowControlStatement <- ContinueStatement / BreakStatement / ReturnStatement
ContinueStatement <- 'continue' ';'
BreakStatement <- 'break' ';'
ReturnStatement <- 'return' Expression? ';'

AssignmentStatement <- AssignmentBody ';'
AssignmentBody <- DirectAssignmentBody / UpdateAssignmentBody / CrementAssignmentBody
DirectAssignmentBody <- Expression '=' Expression
UpdateAssignmentBody <- Expression ('+=' / '-=' / '*=' / '/=' / '%=' / '&=' / '^=' / '|=') Expression
CrementAssignmentBody <- Expression ('++' / '--')

Type <- ConstType / PointerType / ArrayType / FunctionType / BasicType
ConstType <- 'const' BasicType
PointerType <- BasicType '*'
ArrayType <- BasicType '[' Expression ']'
FunctionType <- BasicType 'function' '(' FunctionTypeArgs? ')'
FunctionTypeArgs <- BasicType (',' BasicType)* ','?
BasicType <- 'void' / 'bool' / 'float32' / 'float64' /
             'int8' / 'int16' / 'int32' / 'int64' / 'intaddr' / 'intmax' / 'intsize' /
             'uint8' / 'uint16' / 'uint32' / 'uint64' / 'uintaddr' / 'uintmax' / 'uintsize' /
             ('struct' identifier) / ('enum' identifier) / ('union' identifier) / ('(' Type ')')


AtomicExpression <- identifier / constant / 'true' / 'false' / string-literal / ('(' Expression ')')

ObjectExpression <- (AtomicExpression ObjectSuffix*) / ArrayLiteral / StructLiteral
ObjectSuffix <- ArrayIndexSuffix / FunctionCallSuffix / StructElementSuffix / StructPointerElementSuffix

ArrayIndexSuffix <- '[' Expression ']'

FunctionCallSuffix <- '(' CommasExpressionList? ')'
CommasExpressionList <- Expression (',' Expression)* ','?

StructElementSuffix <- '.' identifier

StructPointerElementSuffix <- '->' identifier

ArrayLiteral <- '{' CommasExpressionList '}'

StructLiteral <- '{' StructLiteralElement (',' StructLiteralElement)* ','? '}'
StructLiteralElement <- '.' identifier '=' Expression

FactorExpression <- CastExpression / AddressOfExpression / DerefExpression / PositiveExpression / NegativeExpression / BitwiseNotExpression / LogicalNotExpression / SizeofExpression / ObjectExpression

CastExpression <- '(' Type ')' ObjectExpression

AddressOfExpression <- '&' ObjectExpression

DerefExpression <- '*' ObjectExpression

PositiveExpression <- '+' ObjectExpression

NegativeExpression <- '-' ObjectExpression

BitwiseNotExpression <- '~' ObjectExpression

LogicalNotExpression <- '!' ObjectExpression

SizeofExpression <- ('sizeof' ObjectExpression) / ('sizeof' Type)

TermExpression <- FactorExpression TermSuffix?
TermSuffix <- ('*' FactorExpression)+ / ('/' FactorExpression)+ / ('%' FactorExpression)+

ArithmeticExpression <- TermExpression ArithmeticSuffix?
ArithmeticSuffix <- ('+' TermExpression)+ / ('-' TermExpression)+

BitwiseOpExpression <- ShiftExpression / XorExpression / BitwiseAndExpression / BitwiseOrExpression / ArithmeticExpression

ShiftExpression <- (ArithmeticExpression '<<' ArithmeticExpression) / (ArithmeticExpression '>>' ArithmeticExpression)

XorExpression <- ArithmeticExpression '^' ArithmeticExpression

BitwiseAndExpression <- ArithmeticExpression ('&' ArithmeticExpression)+

BitwiseOrExpression <- ArithmeticExpression ('|' ArithmeticExpression)+

ComparisonExpression <- EqualExpression / NotEqualExpression / LessEqExpression / GreaterEqExpression / LessThanExpression / GreaterThanExpression / BitwiseOpExpression

EqualExpression <- BitwiseOpExpression '==' BitwiseOpExpression

NotEqualExpression <- BitwiseOpExpression '!=' BitwiseOpExpression

LessEqExpression <- BitwiseOpExpression '<=' BitwiseOpExpression

GreaterEqExpression <- BitwiseOpExpression '>=' BitwiseOpExpression

LessThanExpression <- BitwiseOpExpression '<' BitwiseOpExpression

GreaterThanExpression <- BitwiseOpExpression '>' BitwiseOpExpression

LogicalOpExpression <- LogicalAndExpression / LogicalOrExpression / ComparisonExpression

LogicalAndExpression <- ComparisonExpression ('&&' ComparisonExpression)+

LogicalOrExpression <- ComparisonExpression ('||' ComparisonExpression)+

Expression <- LogicalOpExpression

identifier = "identifier"
constant = "constant"
string_literal = "string_literal"
""")
""".replace(' <- ', ' = ').replace('string-literal', 'string_literal'))


class LegibleParseError(ParseError):

M crowbar_reference_compiler/scanner.py => crowbar_reference_compiler/scanner.py +43 -17
@@ 1,5 1,5 @@
from dataclasses import dataclass
from typing import Optional, overload, List, Union
from typing import Optional, List

import regex as re  # type: ignore



@@ 24,12 24,33 @@ class GenerousTokenList(List[Token]):
            return Token('')


KEYWORD = re.compile("bool|break|case|char|const|continue|default|do|double|else|enum|extern|float|for|fragile|function|if|include|int|long|return|short|signed|sizeof|struct|switch|typedef|unsigned|void|while")
# Reserved words of the language, one initial letter per line.  Compiled in
# VERBOSE mode so the layout whitespace is ignored by the regex engine.
# Every quoted word used by the parser grammar must appear here; otherwise the
# scanner emits it as an `identifier` token and the grammar literal can never
# match (this was the case for `intaddr`).
KEYWORD = re.compile(r"""
    bool|break|
    case|char|const|continue|
    default|do|
    else|enum|
    false|float32|float64|for|fragile|function|
    if|include|int8|int16|int32|int64|intaddr|intmax|intsize|
    opaque|
    return|
    sizeof|struct|switch|
    true|
    uint8|uint16|uint32|uint64|uintaddr|uintmax|uintsize|union|
    void|
    while""", re.VERBOSE)
# An identifier starts with a letter (L), connector punctuation (Pc, e.g. "_"),
# modifier symbol (Sk) or non-spacing mark (Mn); subsequent characters may
# additionally be numbers (N).  The \p{...} property classes rely on the
# third-party `regex` module imported above as `re`.
IDENTIFIER = re.compile(r"[\p{L}\p{Pc}\p{Sk}\p{Mn}][\p{L}\p{Pc}\p{Sk}\p{Mn}\p{N}]*")
CONSTANT = re.compile(r"""([0-9_]+)|(0[bB][01_]+)|0o[0-7_]+|(0[xX][0-9a-fA-F_]+)|([0-9_]+(\.[0-9_]+|[eE][0-9_]+|\.[0-9_]+[eE][0-9_]+))|('([^\'\\]|\\'|\\"|\\\\|\\r|\\n|\\t|\\0|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8})')""")
STRING_LITERAL = re.compile(r'''"([^\\"]|\\'|\\"|\\\\|\\r|\\n|\\t|\\0|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8})*"''')
# Numeric constant patterns.  Every digit class accepts "_" as a visual
# separator; HEX_CONSTANT previously omitted it, so "0xFF_FF" was cut short at
# the underscore.
DECIMAL_CONSTANT = re.compile(r"[0-9_]+")
BINARY_CONSTANT = re.compile(r"0[bB][01_]+")
OCTAL_CONSTANT = re.compile(r"0o[0-7_]+")
HEX_CONSTANT = re.compile(r"0[xX][0-9a-fA-F_]+")
# Decimal float: integer part, mandatory fraction, optional signed exponent.
FLOAT_CONSTANT = re.compile(r"[0-9_]+\.[0-9_]+([eE][+-]?[0-9_]+)?")
# Hex float: "0fx"/"0FX" prefix, hex mantissa with fraction, mandatory binary
# exponent introduced by p/P.
HEX_FLOAT_CONSTANT = re.compile(r"0(fx|FX)[0-9a-fA-F_]+\.[0-9a-fA-F_]+[pP][+-]?[0-9_]+")

# Escape sequences shared by character constants and string literals:
# a backslash followed by one of ' " \ r n t 0, or a \xHH, \uHHHH or
# \UHHHHHHHH hexadecimal escape.
_ESCAPE_SEQUENCE = r"""\\['"\\rnt0]|\\x[0-9a-fA-F]{2}|\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8}"""
# Exactly one plain character (anything except ' and \) or one escape
# sequence between single quotes.
CHAR_CONSTANT = re.compile(r"'([^'\\]|" + _ESCAPE_SEQUENCE + r")'")
# Zero or more plain characters / escape sequences between double quotes.
# The repetition is `*`, not `+`, so that the empty string literal "" scans.
STRING_LITERAL = re.compile(r'"([^"\\]|' + _ESCAPE_SEQUENCE + r')*"')
# Multi-character punctuators ("->", "++", "<<", "&&", compound assignments
# such as "+=", ...) are listed before the single-character class so that the
# regex alternation matches them whole instead of as two separate tokens.
PUNCTUATOR = re.compile(r"->|\+\+|--|>>|<<|<=|>=|&&|\|\||[=!+\-*/%&|^]=|[\[\](){}.,+\-*/%;!&|^~><=]")
WHITESPACE = re.compile(r"[\p{Zs}\p{Cc}]+")
# A run of Unicode separator characters (general category Z) and/or control
# characters (Cc).  \p{...} requires the `regex` module imported as `re`.
WHITESPACE = re.compile(r"[\p{Z}\p{Cc}]+")
# A line comment ("//" up to, but not including, the end of the line) or a
# non-greedy block comment ("/* ... */", which may span lines thanks to
# DOTALL).  The line-comment branch does not require a trailing newline, so a
# comment on the last line of a file without a final newline still matches;
# the newline itself is left to be consumed as whitespace.
COMMENT = re.compile(r"(//[^\n]*)|(/\*.*?\*/)", re.DOTALL)




@@ 46,20 67,25 @@ def scan(code):
        if match:
            remaining = remaining[match.end():]
            continue
        match = KEYWORD.match(remaining)
        if match:
            result.append(Token(match.group()))
            remaining = remaining[match.end():]
        kw_match = KEYWORD.match(remaining)
        id_match = IDENTIFIER.match(remaining)
        if kw_match and ((not id_match) or len(kw_match.group()) == len(id_match.group())):
            result.append(Token(kw_match.group()))
            remaining = remaining[kw_match.end():]
            continue
        match = IDENTIFIER.match(remaining)
        if match:
            result.append(Token('identifier', match.group()))
            remaining = remaining[match.end():]
        if id_match:
            result.append(Token('identifier', id_match.group()))
            remaining = remaining[id_match.end():]
            continue
        match = CONSTANT.match(remaining)
        if match:
            result.append(Token('constant', match.group()))
            remaining = remaining[match.end():]
        was_constant = False
        for constant in [DECIMAL_CONSTANT, BINARY_CONSTANT, OCTAL_CONSTANT, HEX_CONSTANT, FLOAT_CONSTANT, HEX_FLOAT_CONSTANT, CHAR_CONSTANT]:
            match = constant.match(remaining)
            if match:
                result.append(Token('constant', match.group()))
                remaining = remaining[match.end():]
                was_constant = True
                break
        if was_constant:
            continue
        match = STRING_LITERAL.match(remaining)
        if match:

M tests/test_hello_world.py => tests/test_hello_world.py +1 -1
@@ 8,7 8,7 @@ class TestHelloWorld(unittest.TestCase):
        code = r"""
include "stdio.hro";

int main() {
int32 main() {
    printf("Hello, world!\n");
    return 0;
}

M tests/test_parsing.py => tests/test_parsing.py +16 -18
@@ 5,41 5,39 @@ from crowbar_reference_compiler import parse_header, parse_implementation, scan

class TestParsing(unittest.TestCase):
    def test_basic(self):
        print(parse_header(scan("int x();")))
        print(parse_header(scan("int8 x();")))

    def test_scdoc_str(self):
        # adapted from https://git.sr.ht/~sircmpwn/scdoc/tree/master/include/str.h
        print(parse_header(scan(r"""
include "stdint.h";

struct str {
    char *str;
    typedef size_t len;
    typedef size_t size;
};
    (uint8[size])* str;
    uintsize len;
    uintsize size;
}

struct str *str_create();
void str_free(struct str *str);
void str_reset(struct str *str);
int str_append_ch(struct str *str, typedef uint32_t ch);
intsize str_append_ch(struct str *str, uint32 ch);
""")))
        # adapted from https://git.sr.ht/~sircmpwn/scdoc/tree/master/src/string.c
        print(parse_implementation(scan(r"""
include "stdlib.h";
include "stdint.h";
include "str.h";
include "unicode.h";
include "stdlib.hro";
include "stdint.hro";
include "str.hro";
include "unicode.hro";

int ensure_capacity(struct str *str, typedef size_t len) {
bool ensure_capacity(struct str *str, intsize len) {
    if (len + 1 >= str->size) {
        char *new = realloc(str->str, str->size * 2);
        (uint8[str->size * 2])* new = realloc(str->str, str->size * 2);
        if (!new) {
            return 0;
            return false;
        }
        str->str = new;
        str->size *= 2;
    }
    return 1;
    return true;
}

struct str *str_create() {


@@ 59,8 57,8 @@ void str_free(struct str *str) {
    free(str);
}

int str_append_ch(struct str *str, typedef uint32_t ch) {
    int size = utf8_chsize(ch);
intsize str_append_ch(struct str *str, uint32 ch) {
    intsize size = utf8_chsize(ch);
    if (size <= 0) {
        return -1;
    }